From 66311274691ec65972cad3626057fa8d00c146d8 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 23 Mar 2013 11:42:24 +0800 Subject: block: add a flag to identify PM request Add a flag, REQ_PM, to identify a request as PM related; such requests will not change the device request queue's runtime status. It is intended to be used in a driver's runtime PM callbacks, so that the driver can perform I/O to the device there without affecting the queue's runtime status. E.g. in the SCSI disk's runtime suspend callback, the disk is put into the stopped power state, and this requires sending a command to the device. Such command processing should not change the disk's runtime status. Signed-off-by: Lin Ming Signed-off-by: Aaron Lu Acked-by: Alan Stern Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..fcc1ce28d5ca 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -175,6 +175,7 @@ enum rq_flag_bits { __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_KERNEL, /* direct IO to kernel pages */ + __REQ_PM, /* runtime pm request */ __REQ_NR_BITS, /* stops here */ }; @@ -223,5 +224,6 @@ enum rq_flag_bits { #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) #define REQ_KERNEL (1 << __REQ_KERNEL) +#define REQ_PM (1 << __REQ_PM) #endif /* __LINUX_BLK_TYPES_H */ -- cgit v1.2.3 From 6c9546675864f51506af69eca388e5d922942c56 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 23 Mar 2013 11:42:26 +0800 Subject: block: add runtime pm helpers Add runtime pm helper functions: void blk_pm_runtime_init(struct request_queue *q, struct device *dev) - Initialization function for drivers to call. int blk_pre_runtime_suspend(struct request_queue *q) - If any requests are in the queue, mark last busy and return -EBUSY. Otherwise set q->rpm_status to RPM_SUSPENDING and return 0. void blk_post_runtime_suspend(struct request_queue *q, int err) - If the suspend succeeded then set q->rpm_status to RPM_SUSPENDED. Otherwise set it to RPM_ACTIVE and mark last busy. void blk_pre_runtime_resume(struct request_queue *q) - Set q->rpm_status to RPM_RESUMING. void blk_post_runtime_resume(struct request_queue *q, int err) - If the resume succeeded then set q->rpm_status to RPM_ACTIVE and call __blk_run_queue, then mark last busy and autosuspend. Otherwise set q->rpm_status to RPM_SUSPENDED.
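Below is a minimal usage sketch, assuming a hypothetical driver built around these helpers; the foo_* names and the stop/start command helpers are placeholders, not code from this series. The point is the pairing: blk_pre_runtime_suspend() before powering down, blk_post_runtime_suspend() after, and likewise for resume, with any command issued from the callbacks marked REQ_PM so it leaves the queue's runtime status alone.

```c
#include <linux/blkdev.h>
#include <linux/pm_runtime.h>

/* Hypothetical driver glue - "foo" is a placeholder, not an in-tree driver. */
struct foo_device {
	struct device *dev;
	struct request_queue *queue;
};

static int foo_setup_runtime_pm(struct foo_device *foo)
{
	/* Tell the block layer which device's runtime PM state this queue tracks. */
	blk_pm_runtime_init(foo->queue, foo->dev);
	pm_runtime_set_autosuspend_delay(foo->dev, 5000);
	pm_runtime_use_autosuspend(foo->dev);
	pm_runtime_put_autosuspend(foo->dev);
	return 0;
}

static int foo_runtime_suspend(struct device *dev)
{
	struct foo_device *foo = dev_get_drvdata(dev);
	int err;

	/* Fails with -EBUSY (and marks last busy) if requests are still queued. */
	err = blk_pre_runtime_suspend(foo->queue);
	if (err)
		return err;

	/*
	 * Power the device down.  Any command issued here should carry REQ_PM
	 * so it does not disturb q->rpm_status or nr_pending.
	 */
	err = foo_send_stop_command(foo);		/* hypothetical helper */
	blk_post_runtime_suspend(foo->queue, err);
	return err;
}

static int foo_runtime_resume(struct device *dev)
{
	struct foo_device *foo = dev_get_drvdata(dev);
	int err;

	blk_pre_runtime_resume(foo->queue);
	err = foo_send_start_command(foo);		/* hypothetical helper */
	blk_post_runtime_resume(foo->queue, err);	/* reruns the queue on success */
	return err;
}

static const struct dev_pm_ops foo_pm_ops = {
	SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
};
```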
The idea and API is designed by Alan Stern and described here: http://marc.info/?l=linux-scsi&m=133727953625963&w=2 Signed-off-by: Lin Ming Signed-off-by: Aaron Lu Acked-by: Alan Stern Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78feda9bbae2..89d89c7162aa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -361,6 +361,12 @@ struct request_queue { */ struct kobject kobj; +#ifdef CONFIG_PM_RUNTIME + struct device *dev; + int rpm_status; + unsigned int nr_pending; +#endif + /* * queue settings */ @@ -960,6 +966,27 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +/* + * block layer runtime pm functions + */ +#ifdef CONFIG_PM_RUNTIME +extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); +extern int blk_pre_runtime_suspend(struct request_queue *q); +extern void blk_post_runtime_suspend(struct request_queue *q, int err); +extern void blk_pre_runtime_resume(struct request_queue *q); +extern void blk_post_runtime_resume(struct request_queue *q, int err); +#else +static inline void blk_pm_runtime_init(struct request_queue *q, + struct device *dev) {} +static inline int blk_pre_runtime_suspend(struct request_queue *q) +{ + return -ENOSYS; +} +static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} +static inline void blk_pre_runtime_resume(struct request_queue *q) {} +static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} +#endif + /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests -- cgit v1.2.3 From 57fb233f078beb5d0437a4ae575fbd4d9eb9c738 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Aug 2012 04:56:11 -0700 Subject: block: Reorder struct bio_set This is prep work for the next patch, which embeds a struct bio_list in struct bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe --- include/linux/bio.h | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 820e7aaad4fd..93d3d17a300d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,39 +298,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -#define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) - -struct bio_set { - struct kmem_cache *bio_slab; - unsigned int front_pad; - - mempool_t *bio_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; -#endif - mempool_t *bvec_pool; -}; - -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - -/* - * a small number of entries is fine, not going to be performance critical. 
- * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 - #ifdef CONFIG_HIGHMEM /* * remember never ever reenable interrupts between a bvec_kmap_irq and @@ -527,6 +494,39 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) + +struct bio_set { + struct kmem_cache *bio_slab; + unsigned int front_pad; + + mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif + mempool_t *bvec_pool; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) -- cgit v1.2.3 From df2cb6daa4cbc34406bc4b1ac9b9335df1083a72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:33:46 -0700 Subject: block: Avoid deadlocks with bio allocation by stacking drivers Previously, if we ever tried to allocate more than once from the same bio set while running under generic_make_request() (i.e. in a stacking block driver), we risked deadlock. This is because of the code in generic_make_request() that converts recursion to iteration; any bios we submit won't actually be submitted (so they can complete and eventually be freed) until after we return - this means that if we allocate a second bio, we're blocking the first one from ever being freed. Thus, if enough threads call into a stacking block driver at the same time with bios that need multiple splits, and the bio_set's reserve gets used up, we deadlock. This can be worked around in the driver code - we could check if we're running under generic_make_request(), then mask out __GFP_WAIT when we go to allocate a bio, and if the allocation fails punt to a workqueue and retry the allocation. But this is tricky and not a generic solution. This patch solves it for all users by inverting the previously described technique. We allocate a rescuer workqueue for each bio_set, and then in the allocation code, if there are bios on current->bio_list and we would otherwise block, we punt them to the rescuer workqueue to be submitted. This guarantees forward progress for bio allocations under generic_make_request(), provided each bio is submitted before allocating the next, and provided the bios are freed after they complete. Note that this doesn't do anything for allocation from other mempools. Instead of allocating per-bio data structures from a mempool, code should use the bio_set's front_pad. Tested by forcing the rescue codepath to be taken (by disabling the first GFP_NOWAIT attempt), then running it with bcache (which does a lot of arbitrary bio splitting) and verifying that the rescuer was being invoked.
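A compressed sketch of the technique just described, assuming the rescue_* fields this patch adds to struct bio_set; the in-tree logic lives in bio_alloc_bioset()/punt_bios_to_rescuer() in fs/bio.c and differs in detail.

```c
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/sched.h>

/* Hand everything queued by generic_make_request() over to the rescuer. */
static void punt_bios_to_rescuer(struct bio_set *bs)
{
	struct bio_list punt;
	struct bio *bio;

	bio_list_init(&punt);
	while ((bio = bio_list_pop(current->bio_list)))
		bio_list_add(&punt, bio);

	spin_lock(&bs->rescue_lock);
	bio_list_merge(&bs->rescue_list, &punt);
	spin_unlock(&bs->rescue_lock);

	/* rescue_work submits the punted bios, so they can complete and free. */
	queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

/* Allocation path, heavily abridged (bio initialisation omitted). */
static struct bio *sketch_alloc_bio(gfp_t gfp_mask, struct bio_set *bs)
{
	gfp_t saved_gfp = gfp_mask;
	void *p;

	/* Under generic_make_request(): try a non-blocking allocation first. */
	if (current->bio_list && !bio_list_empty(current->bio_list))
		gfp_mask &= ~__GFP_WAIT;

	p = mempool_alloc(bs->bio_pool, gfp_mask);
	if (!p && gfp_mask != saved_gfp) {
		/* Punt the pending bios, then retry with the caller's gfp mask. */
		punt_bios_to_rescuer(bs);
		p = mempool_alloc(bs->bio_pool, saved_gfp);
	}
	return p;
}
```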
Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo Reviewed-by: Muthukumar Ratty --- include/linux/bio.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 93d3d17a300d..b31036ff779f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -513,6 +513,15 @@ struct bio_set { mempool_t *bio_integrity_pool; #endif mempool_t *bvec_pool; + + /* + * Deadlock avoidance for stacking block drivers: see comments in + * bio_alloc_bioset() for details + */ + spinlock_t rescue_lock; + struct bio_list rescue_list; + struct work_struct rescue_work; + struct workqueue_struct *rescue_workqueue; }; struct biovec_slab { -- cgit v1.2.3 From 6fda981cafbf908acd11e1e636fec50e99d56a47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 13:18:27 -0700 Subject: block: Fix a buffer overrun in bio_integrity_split() bio_integrity_split() seemed to be confusing pointers and arrays - bip_vec in bio_integrity_payload was an array appended to the end of the payload, so the bio_vecs in struct bio_pair should have come after the bio_integrity_payload they're for. Fix it by making bip_vec a pointer to the inline vecs - a later patch is going to make more use of this pointer. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- include/linux/bio.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index b31036ff779f..81004fdcc277 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -182,7 +182,9 @@ struct bio_integrity_payload { unsigned short bip_idx; /* current bip_vec index */ struct work_struct bip_work; /* I/O completion */ - struct bio_vec bip_vec[0]; /* embedded bvec array */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3 From 9f060e2231ca96ca94f2ffcff730acd72606b280 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 15:29:33 -0700 Subject: block: Convert integrity to bvec_alloc_bs() This adds a pointer to the bvec array to struct bio_integrity_payload, instead of the bvecs always being inline; then the bvecs are allocated with bvec_alloc_bs(). Changed bvec_alloc_bs() and bvec_free_bs() to take a pointer to a mempool instead of the bioset, so that bio integrity can use a different mempool for its bvecs, and thus avoid a potential deadlock. This is eventually for immutable bio vecs - immutable bvecs aren't useful if we still have to copy them, hence the need for the pointer. Less code is always nice too, though. Also, bio_integrity_alloc() was using fs_bio_set if no bio_set was specified. This was wrong - using the bio_set doesn't protect us from memory allocation failures, because we just used kmalloc for the bio_integrity_payload. But it does introduce the possibility of deadlock, if for some reason we weren't supposed to be using fs_bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. 
Petersen --- include/linux/bio.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 81004fdcc277..669b1cb18fee 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -213,6 +213,7 @@ extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); +extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); @@ -288,8 +289,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); -extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); -extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); +extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP @@ -511,10 +512,11 @@ struct bio_set { unsigned int front_pad; mempool_t *bio_pool; + mempool_t *bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; + mempool_t *bvec_integrity_pool; #endif - mempool_t *bvec_pool; /* * Deadlock avoidance for stacking block drivers: see comments in -- cgit v1.2.3 From 054bdf646e36c2f7dc1bf6bc6209dbbb5909164b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Sep 2012 13:17:55 -0700 Subject: block: Add bio_advance() This is prep work for immutable bio vecs; we first want to centralize where bvecs are modified. Next two patches convert some existing code to use this function. Signed-off-by: Kent Overstreet CC: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 669b1cb18fee..fcb4dba2d8ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -248,6 +248,8 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern void bio_advance(struct bio *, unsigned); + extern void bio_init(struct bio *); extern void bio_reset(struct bio *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..c178d25e588b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -197,6 +197,8 @@ enum rq_flag_bits { REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK +#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) + /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) -- cgit v1.2.3 From f73a1c7d117d07a96d89475066188a2b79e53c48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Sep 2012 15:05:12 -0700 Subject: block: Add bio_end_sector() Just a little convenience macro - main reason to add it now is preparing for immutable bio vecs, it'll reduce the size of the patch that puts bi_sector/bi_size/bi_idx into a struct bvec_iter. 
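Two illustrative uses of the new helpers against the 3.x-era bio fields; the functions below are invented for the example, not taken from the tree.

```c
#include <linux/bio.h>

/* Does the bio end at or before the given sector limit? */
static bool bio_fits_below(struct bio *bio, sector_t limit)
{
	/* bio_end_sector() is the first sector past the bio's data. */
	return bio_end_sector(bio) <= limit;
}

/* Consume 'done' bytes of a bio on partial completion. */
static void foo_complete_bytes(struct bio *bio, unsigned int done, int error)
{
	bio_advance(bio, done);		/* updates bi_sector, bi_size and bi_idx */

	if (error || !bio->bi_size)	/* failed, or nothing left to do */
		bio_endio(bio, error);
}
```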
Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Lars Ellenberg CC: Jiri Kosina CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Neil Brown CC: Martin Schwidefsky CC: Heiko Carstens CC: linux-s390@vger.kernel.org CC: Chris Mason CC: Steven Whitehouse Acked-by: Steven Whitehouse --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index fcb4dba2d8ea..20507eb7c979 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -67,6 +67,7 @@ #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) +#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) static inline unsigned int bio_cur_bytes(struct bio *bio) { -- cgit v1.2.3 From aa8b57aa3d1c06ca53312294ee6dfc767ee3ddb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 5 Feb 2013 15:19:29 -0800 Subject: block: Use bio_sectors() more consistently Bunch of places in the code weren't using it where they could be - this'll reduce the size of the patch that puts bi_sector/bi_size/bi_idx into a struct bvec_iter. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: "Ed L. Cashin" CC: Nick Piggin CC: Jiri Kosina CC: Jim Paris CC: Geoff Levand CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Neil Brown CC: Steven Rostedt Acked-by: Ed Cashin --- include/trace/events/block.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9961726523d0..5a28843725df 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -244,7 +244,7 @@ TRACE_EVENT(block_bio_bounce, __entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -281,7 +281,7 @@ TRACE_EVENT(block_bio_complete, __entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); __entry->error = error; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); ), @@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -376,7 +376,7 @@ TRACE_EVENT(block_bio_queue, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -404,7 +404,7 @@ DECLARE_EVENT_CLASS(block_get_rq, TP_fast_assign( __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_sector : 0; - __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + __entry->nr_sector = bio ? bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, bio ? 
bio->bi_rw : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); @@ -580,7 +580,7 @@ TRACE_EVENT(block_bio_remap, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); -- cgit v1.2.3 From 9e882242c6193ae6f416f2d8d8db0d9126bd996b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:41:12 -0700 Subject: block: Add submit_bio_wait(), remove from md Random cleanup - this code was duplicated and it's not really specific to md. Also added the ability to return the actual error code. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown Acked-by: Tejun Heo --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 20507eb7c979..b20a9cd776dd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -249,6 +249,7 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern int submit_bio_wait(int rw, struct bio *bio); extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *); -- cgit v1.2.3 From 16ac3d63e74f3d6e34e42d6e523b6a61de0020f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 13:57:51 -0700 Subject: block: Add bio_copy_data() This gets open coded quite a bit and it's tricky to get right, so make a generic version and convert some existing users over to it instead. Signed-off-by: Kent Overstreet CC: Jens Axboe --- include/linux/bio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index b20a9cd776dd..90d36c65cb70 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi) } #endif +extern void bio_copy_data(struct bio *dst, struct bio *src); + extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); -- cgit v1.2.3 From d74c6d514fe314b8bdab58b487b25992291577ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2013 12:23:11 -0800 Subject: block: Add bio_for_each_segment_all() __bio_for_each_segment() iterates bvecs from the specified index instead of bio->bi_idx. Currently, the only usage is to walk all the bvecs after the bio has been advanced, by specifying index 0. For immutable bvecs, we need to split these apart; bio_for_each_segment() is going to have a different implementation. This will also help document the intent of code that's using it - bio_for_each_segment_all() is only legal to use for code that owns the bio.
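A sketch of the ownership rule, assuming a caller that built its own bio (and using submit_bio_wait() from the patch above); the functions are illustrative only. Code that owns the bio may walk every bvec with bio_for_each_segment_all(), while a driver handed someone else's bio must use bio_for_each_segment(), which starts at bi_idx and covers only the part it was given.

```c
#include <linux/bio.h>

/* We built this bio and own it, so walking all bvecs afterwards is legal. */
static int foo_read_and_release(struct block_device *bdev, sector_t sector,
				struct bio *bio)
{
	struct bio_vec *bvec;
	int i, err;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	err = submit_bio_wait(READ, bio);	/* returns the I/O error, if any */

	bio_for_each_segment_all(bvec, bio, i)
		__free_page(bvec->bv_page);

	return err;
}

/* Driver-side walk of a bio it does not own: only the pending segments. */
static unsigned int foo_pending_bytes(struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned int bytes = 0;
	int i;

	bio_for_each_segment(bvec, bio, i)
		bytes += bvec->bv_len;

	return bytes;
}
```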
Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Neil Brown CC: Boaz Harrosh --- include/linux/bio.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 90d36c65cb70..be2efa09f9bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -137,16 +137,27 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define bio_io_error(bio) bio_endio((bio), -EIO) /* - * drivers should not use the __ version unless they _really_ want to - * run through the entire bio and not just pending pieces + * drivers should not use the __ version unless they _really_ know what + * they're doing */ #define __bio_for_each_segment(bvl, bio, i, start_idx) \ for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ i < (bio)->bi_vcnt; \ bvl++, i++) +/* + * drivers should _never_ use the all version - the bio may have been split + * before it got to the driver and the driver won't own all of it + */ +#define bio_for_each_segment_all(bvl, bio, i) \ + for (i = 0; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) + #define bio_for_each_segment(bvl, bio, i) \ - __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) + for (i = (bio)->bi_idx; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) /* * get a reference to a bio, so it won't disappear. the intended use is -- cgit v1.2.3 From a07876064a0b73ab5ef1ebcf14b1cf0231c07858 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:03:28 -0700 Subject: block: Add bio_alloc_pages() More utility code to replace stuff that's getting open coded. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index be2efa09f9bf..e25378f2f408 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,6 +298,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); +extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); -- cgit v1.2.3 From a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 May 2012 13:03:11 -0700 Subject: block: Add an explicit bio flag for bios that own their bvec This is for the new bio splitting code. When we split a bio, if the split occurs on a bvec boundary, we reuse the bvec for the new bio. But that means bio_free() can't free it, hence the explicit flag.
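A rough sketch of the free-side check this flag enables (the actual bio_free() in fs/bio.c differs in detail): only a bio that owns its bvec array hands it back to the pool; a split bio that borrowed the array from its parent does not.

```c
#include <linux/bio.h>

static void sketch_free_vecs(struct bio *bio, struct bio_set *bs)
{
	/* Split bios reusing the parent's bvec never set BIO_OWNS_VEC. */
	if (bio_flagged(bio, BIO_OWNS_VEC))
		bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
}
```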
Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo --- include/linux/bio.h | 5 ----- include/linux/blk_types.h | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index e25378f2f408..794bcd0c5039 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -85,11 +85,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -static inline int bio_has_allocated_vec(struct bio *bio) -{ - return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; -} - /* * will die */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c178d25e588b..538289ffc704 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -117,6 +117,7 @@ struct bio { * BIO_POOL_IDX() */ #define BIO_RESET_BITS 12 +#define BIO_OWNS_VEC 12 /* bio_free() should free bvec */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) -- cgit v1.2.3 From 29ed7813ce5c4661261aeebddb1b8660e0860223 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 09:54:22 -0700 Subject: bio-integrity: Add explicit field for owner of bip_buf This was the only real user of BIO_CLONED, which didn't have very clear semantics. Convert to its own flag so we can get rid of BIO_CLONED. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 794bcd0c5039..ef24466d8f82 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -187,6 +187,7 @@ struct bio_integrity_payload { unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_idx; /* current bip_vec index */ + unsigned bip_owns_buf:1; /* should free bip_buf */ struct work_struct bip_work; /* I/O completion */ -- cgit v1.2.3 From 181387da2d64c3129e5b5186c4dd388bc5041d53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: remove unused bdi_pending_list There's no user left. Remove it. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu --- include/linux/backing-dev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 350459910fe1..a5ef27f5411a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,7 +130,6 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { -- cgit v1.2.3 From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread, which forks off worker threads for other bdis. There's no reason for writeback to implement its own worker pool when using an unbound workqueue instead is much simpler and more efficient. This patch replaces the custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated, but the following points are worth mentioning.
* bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- include/linux/backing-dev.h | 15 +++++---------- include/trace/events/writeback.h | 5 ----- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback 
*wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), -- cgit v1.2.3 From 871dd9286e25330c8a581e5dacfa8b1dfe1dd641 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 24 Apr 2013 08:52:50 -0600 Subject: block: fix max discard sectors limit linux-v3.8-rc1 and later added plug support for blkdev_issue_discard with commit 0cfbcafcae8b7364b5fa96c2b26ccde7a3a296a9 (block: add plug for blkdev_issue_discard). For example, 1) DISCARD rq-1 with size 4GB 2) DISCARD rq-2 with size 1GB If these 2 discard requests get merged, the final request size will be 5GB. In this case, the request's __data_len field may overflow, as it can store at most 4GB (unsigned int). This issue was observed while doing mkfs.f2fs on a 5GB SD card: https://lkml.org/lkml/2013/4/1/292 Info: sector size = 512 Info: total sectors = 11370496 (in 512bytes) Info: zone aligned segment0 blkaddr: 512 [ 257.789764] blk_update_request: bio idx 0 >= vcnt 0 The mkfs process gets stuck in D state and I see the following in the dmesg: [ 257.789733] __end_that: dev mmcblk0: type=1, flags=122c8081 [ 257.789764] sector 4194304, nr/cnr 2981888/4294959104 [ 257.789764] bio df3840c0, biotail df3848c0, buffer (null), len 1526726656 [ 257.789764] blk_update_request: bio idx 0 >= vcnt 0 [ 257.794921] request botched: dev mmcblk0: type=1, flags=122c8081 [ 257.794921] sector 4194304, nr/cnr 2981888/4294959104 [ 257.794921] bio df3840c0, biotail df3848c0, buffer (null), len 1526726656 This patch fixes this issue.
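A quick userspace check of the arithmetic behind the fix (plain C, not kernel code): merging a 4GB and a 1GB discard produces a byte count that no longer fits in the unsigned int __data_len, while UINT_MAX >> 9 is the largest sector count whose byte length still does, which is what the clamp in the patch below enforces.

```c
#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long long rq1 = 4ULL << 30;	/* 4GB discard */
	unsigned long long rq2 = 1ULL << 30;	/* 1GB discard */
	unsigned long long merged = rq1 + rq2;	/* 5GB after merging */

	printf("merged bytes     : %llu\n", merged);
	printf("as unsigned int  : %u (wrapped)\n", (unsigned int)merged);
	printf("UINT_MAX >> 9    : %u sectors = %llu bytes (fits)\n",
	       UINT_MAX >> 9, (unsigned long long)(UINT_MAX >> 9) << 9);
	return 0;
}
```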
Reported-by: Max Filippov Signed-off-by: James Bottomley Signed-off-by: Namjae Jeon Tested-by: Max Filippov Cc: Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 89d89c7162aa..6189bf26b53d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -844,7 +844,7 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, unsigned int cmd_flags) { if (unlikely(cmd_flags & REQ_DISCARD)) - return q->limits.max_discard_sectors; + return min(q->limits.max_discard_sectors, UINT_MAX >> 9); if (unlikely(cmd_flags & REQ_WRITE_SAME)) return q->limits.max_write_same_sectors; -- cgit v1.2.3