From a91a5ac6858fbf7477131e1210cb3e897b668e6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 20:40:53 -0700 Subject: mempool: add @gfp_mask to mempool_create_node() mempool_create_node() currently assumes %GFP_KERNEL. Its only user, blk_init_free_list(), is about to be updated to use other allocation flags - add @gfp_mask argument to the function. Signed-off-by: Tejun Heo Cc: Andrew Morton Cc: Hugh Dickins Signed-off-by: Jens Axboe --- include/linux/mempool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7c08052e3321..39ed62ab5b8a 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -26,7 +26,8 @@ typedef struct mempool_s { extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, int nid); + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int nid); extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); extern void mempool_destroy(mempool_t *pool); -- cgit v1.2.3 From 86072d8112595ea1b6beeb33f578e7c2839e014e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 20:40:54 -0700 Subject: block: drop custom queue draining used by scsi_transport_{iscsi|fc} iscsi_remove_host() uses bsg_remove_queue() which implements custom queue draining. fc_bsg_remove() open-codes mostly identical logic. The draining logic isn't correct in that blk_stop_queue() doesn't prevent new requests from being queued - it just stops processing, so nothing prevents new requests to be queued after the logic determines that the queue is drained. blk_cleanup_queue() now implements proper queue draining and these custom draining logics aren't necessary. Drop them and use bsg_unregister_queue() + blk_cleanup_queue() instead. Signed-off-by: Tejun Heo Reviewed-by: Mike Christie Acked-by: Vivek Goyal Cc: James Bottomley Cc: James Smart Signed-off-by: Jens Axboe --- include/linux/bsg-lib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index f55ab8cdc106..4d0fb3df2f4a 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result, int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name, bsg_job_fn *job_fn, int dd_job_size); void bsg_request_fn(struct request_queue *q); -void bsg_remove_queue(struct request_queue *q); void bsg_goose_queue(struct request_queue *q); #endif -- cgit v1.2.3 From 8a5ecdd42862bf87ceab00bf2a15d7eabf58c02d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 20:40:58 -0700 Subject: block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv Add q->nr_rqs[] which currently behaves the same as q->rq.count[] and move q->rq.elvpriv to q->nr_rqs_elvpriv. blk_drain_queue() is updated to use q->nr_rqs[] instead of q->rq.count[]. These counters separates queue-wide request statistics from the request list and allow implementation of per-queue request allocation. While at it, properly indent fields of struct request_list. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 07954b05b86c..7e44ed93f84b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -51,11 +51,10 @@ struct request_list { * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC */ - int count[2]; - int starved[2]; - int elvpriv; - mempool_t *rq_pool; - wait_queue_head_t wait[2]; + int count[2]; + int starved[2]; + mempool_t *rq_pool; + wait_queue_head_t wait[2]; }; /* @@ -282,6 +281,8 @@ struct request_queue { struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; + int nr_rqs[2]; /* # allocated [a]sync rqs */ + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ /* * the queue request freelist, one for reads and one for writes -- cgit v1.2.3 From 5b788ce3e2acac9bf109743b1281d77347cf2101 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Jun 2012 20:40:59 -0700 Subject: block: prepare for multiple request_lists Request allocation is about to be made per-blkg meaning that there'll be multiple request lists. * Make queue full state per request_list. blk_*queue_full() functions are renamed to blk_*rl_full() and takes @rl instead of @q. * Rename blk_init_free_list() to blk_init_rl() and make it take @rl instead of @q. Also add @gfp_mask parameter. * Add blk_exit_rl() instead of destroying rl directly from blk_release_queue(). * Add request_list->q and make request alloc/free functions - blk_free_request(), [__]freed_request(), __get_request() - take @rl instead of @q. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e44ed93f84b..f2385ee7c7b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -46,7 +46,12 @@ struct blkcg_gq; struct request; typedef void (rq_end_io_fn)(struct request *, int); +#define BLK_RL_SYNCFULL (1U << 0) +#define BLK_RL_ASYNCFULL (1U << 1) + struct request_list { + struct request_queue *q; /* the queue this rl belongs to */ + /* * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC @@ -55,6 +60,7 @@ struct request_list { int starved[2]; mempool_t *rq_pool; wait_queue_head_t wait[2]; + unsigned int flags; }; /* @@ -562,27 +568,25 @@ static inline bool rq_is_sync(struct request *rq) return rw_is_sync(rq->cmd_flags); } -static inline int blk_queue_full(struct request_queue *q, int sync) +static inline bool blk_rl_full(struct request_list *rl, bool sync) { - if (sync) - return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); - return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + return rl->flags & flag; } -static inline void blk_set_queue_full(struct request_queue *q, int sync) +static inline void blk_set_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_set(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags |= flag; } -static inline void blk_clear_queue_full(struct request_queue *q, int sync) +static inline void blk_clear_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags &= ~flag; } -- cgit v1.2.3 From a051661ca6d134c18599498b185b667859d4339b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 Jun 2012 15:05:44 -0700 Subject: blkcg: implement per-blkg request allocation Currently, request_queue has one request_list to allocate requests from regardless of blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights. This can be easily demonstrated by creating a blkio cgroup w/ very low weight, put a program which can issue a lot of random direct IOs there and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes. This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg making blkcgs completely isolated in terms of request allocation. * Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit harier, this enables avoiding most overhead for root blkcg. * Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with shared request_list but needs to be improved with future changes. * After this change, each block cgroup gets a full request pool making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical). v2: queue-sysfs.txt updated and patch description udpated as suggested by Vivek. v3: blk_get_rl() wasn't checking error return from blkg_lookup_create() and may cause oops on lookup failure. Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer . v4: Updated to accomodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Cc: Wu Fengguang Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f2385ee7c7b2..3816ce8a08fc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -51,7 +51,9 @@ typedef void (rq_end_io_fn)(struct request *, int); struct request_list { struct request_queue *q; /* the queue this rl belongs to */ - +#ifdef CONFIG_BLK_CGROUP + struct blkcg_gq *blkg; /* blkg this request pool belongs to */ +#endif /* * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC @@ -143,6 +145,7 @@ struct request { struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; unsigned long long io_start_time_ns; /* when passed to hardware */ #endif @@ -291,9 +294,12 @@ struct request_queue { int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ /* - * the queue request freelist, one for reads and one for writes + * If blkcg is not used, @q->root_rl serves all requests. If blkcg + * is used, root blkg allocates from @q->root_rl and all other + * blkgs from their own blkg->rl. Which one to use should be + * determined using bio_request_list(). */ - struct request_list rq; + struct request_list root_rl; request_fn_proc *request_fn; make_request_fn *make_request_fn; -- cgit v1.2.3 From c83f6bf98dc1f1a194118b3830706cebbebda8c4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Aug 2012 12:24:18 +0200 Subject: block: add partition resize function to blkpg ioctl Add a new operation code (BLKPG_RESIZE_PARTITION) to the BLKPG ioctl that allows altering the size of an existing partition, even if it is currently in use. This patch converts hd_struct->nr_sects into sequence counter because One might extend a partition while IO is happening to it and update of nr_sects can be non-atomic on 32bit machines with 64bit sector_t. This can lead to issues like reading inconsistent size of a partition. Sequence counter have been used so that readers don't have to take bdev mutex lock as we call sector_in_part() very frequently. Now all the access to hd_struct->nr_sects should happen using sequence counter read/update helper functions part_nr_sects_read/part_nr_sects_write. There is one exception though, set_capacity()/get_capacity(). I think theoritically race should exist there too but this patch does not modify set_capacity()/get_capacity() due to sheer number of call sites and I am afraid that change might break something. I have left that as a TODO item. We can handle it later if need be. This patch does not introduce any new races as such w.r.t set_capacity()/get_capacity(). v2: Add CONFIG_LBDAF test to UP preempt case as suggested by Phillip. Signed-off-by: Vivek Goyal Signed-off-by: Phillip Susi Signed-off-by: Jens Axboe --- include/linux/blkpg.h | 1 + include/linux/genhd.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h index faf8a45af210..a8519446c111 100644 --- a/include/linux/blkpg.h +++ b/include/linux/blkpg.h @@ -40,6 +40,7 @@ struct blkpg_ioctl_arg { /* The subfunctions (for the op field) */ #define BLKPG_ADD_PARTITION 1 #define BLKPG_DEL_PARTITION 2 +#define BLKPG_RESIZE_PARTITION 3 /* Sizes of name fields. Unused at present. */ #define BLKPG_DEVNAMELTH 64 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 017a7fb5a1fc..b88723b81b3d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -98,7 +98,13 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; + /* + * nr_sects is protected by sequence counter. One might extend a + * partition while IO is happening to it and update of nr_sects + * can be non-atomic on 32bit machines with 64bit sector_t. + */ sector_t nr_sects; + seqcount_t nr_sects_seq; sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -648,6 +654,57 @@ static inline void hd_struct_put(struct hd_struct *part) __delete_partition(part); } +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPT case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -- cgit v1.2.3