From b82d9fa257cb3725c49d94d2aeafc4677c34448a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:20 -0700 Subject: block: fix infinite loop for invalid zone append Returning 0 early from __bio_iov_append_get_pages() for the max_append_sectors warning just creates an infinite loop since 0 means success, and the bio will never fill from the unadvancing iov_iter. We could turn the return into an error value, but it will already be turned into an error value later on, so just remove the warning. Clearly no one ever hit it anyway. Fixes: 0512a75b98f84 ("block: Introduce REQ_OP_ZONE_APPEND") Signed-off-by: Keith Busch Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220610195830.3574005-2-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 51c99f2c5c90..d9ff51fc457e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1229,9 +1229,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) size_t offset; int ret = 0; - if (WARN_ON_ONCE(!max_append_sectors)) - return 0; - /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning -- cgit v1.2.3 From c58c0074c54c2e2bb3bb0d5a4d8896bb660cc8bc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:21 -0700 Subject: block/bio: remove duplicate append pages code The getting pages setup for zone append and normal IO are identical. Use common code for each. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-3-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 102 +++++++++++++++++++++++++----------------------------------- 1 file changed, 42 insertions(+), 60 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d9ff51fc457e..ee5fe1bb015e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1159,6 +1159,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off) put_page(pages[i]); } +static int bio_iov_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + bool same_page = false; + + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (WARN_ON_ONCE(bio_full(bio, len))) + return -EINVAL; + __bio_add_page(bio, page, len, offset); + return 0; + } + + if (same_page) + put_page(page); + return 0; +} + +static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + bool same_page = false; + + if (bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page) != len) + return -EINVAL; + if (same_page) + put_page(page); + return 0; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1177,7 +1208,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - bool same_page = false; ssize_t size, left; unsigned len, i; size_t offset; @@ -1186,7 +1216,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * Move page array up in the allocated memory for the bio vecs as far as * 
possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. - */ + */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); @@ -1196,18 +1226,18 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; + int ret; len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = bio_iov_add_zone_append_page(bio, page, len, + offset); + else + ret = bio_iov_add_page(bio, page, len, offset); - if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (same_page) - put_page(page); - } else { - if (WARN_ON_ONCE(bio_full(bio, len))) { - bio_put_pages(pages + i, left, offset); - return -EINVAL; - } - __bio_add_page(bio, page, len, offset); + if (ret) { + bio_put_pages(pages + i, left, offset); + return ret; } offset = 0; } @@ -1216,51 +1246,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) -{ - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; - unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int max_append_sectors = queue_max_zone_append_sectors(q); - struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; - struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i; - size_t offset; - int ret = 0; - - /* - * Move page array up in the allocated memory for the bio vecs as far as - * possible so that we can start filling biovecs from the beginning - * without overwriting the temporary page array. - */ - BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); - pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (unlikely(size <= 0)) - return size ? size : -EFAULT; - - for (left = size, i = 0; left > 0; left -= len, i++) { - struct page *page = pages[i]; - bool same_page = false; - - len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_add_hw_page(q, bio, page, len, offset, - max_append_sectors, &same_page) != len) { - bio_put_pages(pages + i, left, offset); - ret = -EINVAL; - break; - } - if (same_page) - put_page(page); - offset = 0; - } - - iov_iter_advance(iter, size - left); - return ret; -} - /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to @@ -1295,10 +1280,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); /* don't account direct I/O as memory stall */ -- cgit v1.2.3 From 3850e13f2853a17c083eb00fad2c6da4fde0b41e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:22 -0700 Subject: block: export dma_alignment attribute User space may want to know how to align their buffers to avoid bouncing. Export the queue attribute. 
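As a sketch of how user space might consume the new attribute (the device name, the fallback value, and the error handling below are illustrative, not part of this patch):

    #include <stdio.h>
    #include <stdlib.h>

    /* read /sys/block/<dev>/queue/dma_alignment and allocate a buffer
     * whose address satisfies it; the attribute is a mask, so the
     * required alignment is mask + 1 */
    static void *alloc_dio_buffer(const char *dev, size_t len)
    {
            char path[128];
            unsigned int mask = 511;        /* historic default mask */
            size_t align;
            void *buf = NULL;
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/block/%s/queue/dma_alignment", dev);
            f = fopen(path, "r");
            if (f) {
                    if (fscanf(f, "%u", &mask) != 1)
                            mask = 511;
                    fclose(f);
            }
            /* posix_memalign() needs at least pointer-size alignment */
            align = mask + 1 < sizeof(void *) ? sizeof(void *) : mask + 1;
            if (posix_memalign(&buf, align, len))
                    return NULL;
            return buf;
    }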
Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-4-kbusch@fb.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9b905e9443e4..ec716ea26b92 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page return queue_var_show(q->limits.virt_boundary_mask, page); } +static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_dma_alignment(q), page); +} + #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ queue_##name##_show(struct request_queue *q, char *page) \ @@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); @@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = { &blk_throtl_sample_time_entry.attr, #endif &queue_virt_boundary_mask_entry.attr, + &queue_dma_alignment_entry.attr, NULL, }; -- cgit v1.2.3 From 37fee2e42ebbc33e5b7b6944979792bd9aa09e34 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:24 -0700 Subject: block: add a helper function for dio alignment This will make it easier to add more complex acceptable alignment criteria in the future. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-6-kbusch@fb.com Signed-off-by: Jens Axboe --- block/fops.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index d6b3276a6c68..9d32df6fc315 100644 --- a/block/fops.c +++ b/block/fops.c @@ -42,6 +42,13 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) return op; } +static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, + struct iov_iter *iter) +{ + return ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)); +} + #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, @@ -54,8 +61,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (nr_pages <= DIO_INLINE_BIO_VECS) @@ -173,8 +179,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) @@ -298,8 +303,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) -- cgit v1.2.3 From 67927d22015060967122facc8cfeaad8012e8808 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:25 -0700 Subject: block/merge: count bytes 
instead of sectors Individual bv_len's may not be a sector size. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220610195830.3574005-7-kbusch@fb.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 7771dacc99cb..db2e03c8af7f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -201,11 +201,11 @@ static inline unsigned get_max_segment_size(const struct request_queue *q, * @nsegs: [in,out] Number of segments in the bio being built. Incremented * by the number of segments from @bv that may be appended to that * bio without exceeding @max_segs - * @sectors: [in,out] Number of sectors in the bio being built. Incremented - * by the number of sectors from @bv that may be appended to that - * bio without exceeding @max_sectors + * @bytes: [in,out] Number of bytes in the bio being built. Incremented + * by the number of bytes from @bv that may be appended to that + * bio without exceeding @max_bytes * @max_segs: [in] upper bound for *@nsegs - * @max_sectors: [in] upper bound for *@sectors + * @max_bytes: [in] upper bound for *@bytes * * When splitting a bio, it can happen that a bvec is encountered that is too * big to fit in a single segment and hence that it has to be split in the @@ -216,10 +216,10 @@ static inline unsigned get_max_segment_size(const struct request_queue *q, */ static bool bvec_split_segs(const struct request_queue *q, const struct bio_vec *bv, unsigned *nsegs, - unsigned *sectors, unsigned max_segs, - unsigned max_sectors) + unsigned *bytes, unsigned max_segs, + unsigned max_bytes) { - unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; @@ -237,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q, break; } - *sectors += total_len >> 9; + *bytes += total_len; /* tell the caller to split the bvec if it is too big to fit */ return len > 0 || bv->bv_len > max_len; @@ -269,8 +269,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; - unsigned nsegs = 0, sectors = 0; - const unsigned max_sectors = get_max_io_size(q, bio); + unsigned nsegs = 0, bytes = 0; + const unsigned max_bytes = get_max_io_size(q, bio) << 9; const unsigned max_segs = queue_max_segments(q); bio_for_each_bvec(bv, bio, iter) { @@ -282,12 +282,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, goto split; if (nsegs < max_segs && - sectors + (bv.bv_len >> 9) <= max_sectors && + bytes + bv.bv_len <= max_bytes && bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++; - sectors += bv.bv_len >> 9; - } else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs, - max_sectors)) { + bytes += bv.bv_len; + } else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs, + max_bytes)) { goto split; } @@ -300,13 +300,20 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, split: *segs = nsegs; + /* + * Individual bvecs might not be logical block aligned. Round down the + * split size so that each bio is properly block size aligned, even if + * we do not use the full hardware limits. 
+ */ + bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q)); + /* * Bio splitting may cause subtle trouble such as hang when doing sync * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); - return bio_split(bio, sectors, GFP_NOIO, bs); + return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } /** @@ -375,7 +382,7 @@ EXPORT_SYMBOL(blk_queue_split); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; - unsigned int nr_sectors = 0; + unsigned int bytes = 0; struct req_iterator iter; struct bio_vec bv; @@ -398,7 +405,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq) } rq_for_each_bvec(bv, rq, iter) - bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes, UINT_MAX, UINT_MAX); return nr_phys_segs; } -- cgit v1.2.3 From 9cfe3ddecdc556ab1d1693b29e8a26ba80953ccc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:26 -0700 Subject: block/bounce: count bytes instead of sectors Individual bv_len's may not be a sector size. Signed-off-by: Keith Busch Reviewed-by: Damien Le Moal Reviewed-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-8-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bounce.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index 8f7b6fe3b4db..c8f487af7be3 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -205,19 +205,26 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) int rw = bio_data_dir(*bio_orig); struct bio_vec *to, from; struct bvec_iter iter; - unsigned i = 0; + unsigned i = 0, bytes = 0; bool bounce = false; - int sectors = 0; + int sectors; bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_VECS) - sectors += from.bv_len >> 9; + bytes += from.bv_len; if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) return; + /* + * Individual bvecs might not be logical block aligned. Round down + * the split size so that each bio is properly block size aligned, + * even if we do not use the full hardware limits. + */ + sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> + SECTOR_SHIFT; if (sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); -- cgit v1.2.3 From b1a000d3b8ec582da64bb644be633e5a0beffcbf Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Jun 2022 12:58:29 -0700 Subject: block: relax direct io memory alignment Use the address alignment requirements from the block_device for direct io instead of requiring addresses be aligned to the block size. User space can discover the alignment requirements from the dma_alignment queue attribute. User space can specify any hardware compatible DMA offset for each segment, but every segment length is still required to be a multiple of the block size. 
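A hedged user-space sketch of what this relaxation permits (file name and buffer handling are examples; the file offset and each segment length must still be logical-block aligned):

    #define _GNU_SOURCE             /* for O_DIRECT */
    #include <fcntl.h>
    #include <unistd.h>

    /* buf now only needs to satisfy the queue's dma_alignment mask;
     * before this change it had to be logical-block aligned as well */
    static ssize_t read_direct(const char *path, void *buf,
                               size_t len, off_t pos)
    {
            int fd = open(path, O_RDONLY | O_DIRECT);
            ssize_t ret;

            if (fd < 0)
                    return -1;
            ret = pread(fd, buf, len, pos); /* len, pos: block multiples */
            close(fd);
            return ret;
    }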
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220610195830.3574005-11-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++++++ block/fops.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index ee5fe1bb015e..933ea3210954 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1220,7 +1220,16 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + /* + * Each segment in the iov is required to be a block size multiple. + * However, we may not be able to get the entire segment if it spans + * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the + * result to ensure the bio's total size is correct. The remainder of + * the iov data will be picked up in the next bio iteration. + */ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (size > 0) + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); if (unlikely(size <= 0)) return size ? size : -EFAULT; diff --git a/block/fops.c b/block/fops.c index 9d32df6fc315..86d3cab9bf93 100644 --- a/block/fops.c +++ b/block/fops.c @@ -45,8 +45,8 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, struct iov_iter *iter) { - return ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)); + return pos & (bdev_logical_block_size(bdev) - 1) || + !bdev_iter_is_aligned(bdev, iter); } #define DIO_INLINE_BIO_VECS 4 -- cgit v1.2.3 From 798f2a6f734de87633351c3ab13b17b07397cf68 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Wed, 15 Jun 2022 04:18:16 -0400 Subject: block: Directly use ida_alloc()/free() Use ida_alloc()/ida_free() instead of ida_simple_get()/ida_simple_remove(). The latter is deprecated and more verbose. Signed-off-by: Bo Liu Reviewed-by: Christophe JAILLET Link: https://lore.kernel.org/r/20220615081816.4342-1-liubo03@inspur.com Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/blk-sysfs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 27fb1357ad4b..c2cec402d01c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -435,7 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) goto fail_srcu; @@ -485,7 +485,7 @@ fail_stats: fail_split: bioset_exit(&q->bio_split); fail_id: - ida_simple_remove(&blk_queue_ida, q->id); + ida_free(&blk_queue_ida, q->id); fail_srcu: if (alloc_srcu) cleanup_srcu_struct(q->srcu); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ec716ea26b92..69e53d1a4f0e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -791,7 +791,7 @@ static void blk_release_queue(struct kobject *kobj) if (blk_queue_has_srcu(q)) cleanup_srcu_struct(q->srcu); - ida_simple_remove(&blk_queue_ida, q->id); + ida_free(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } -- cgit v1.2.3 From 62c159a03da92121b1b909fd8039028de97885fc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 15 Jun 2022 15:55:47 -0700 Subject: blk-iocost: Simplify ioc_rqos_done() Leave out the superfluous "& REQ_OP_MASK" code. 
The definition of req_op() shows that that code is superfluous: #define req_op(req) ((req)->cmd_flags & REQ_OP_MASK) Compile-tested only. Cc: Tejun Heo Acked-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220615225549.1054905-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 33a11ba971ea..b7082f2aed9c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2769,7 +2769,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; - switch (req_op(rq) & REQ_OP_MASK) { + switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; -- cgit v1.2.3 From 7e923f40a4d229bd9573d6cae64479d6a9770fc3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 15 Jun 2022 15:55:48 -0700 Subject: block: Rename a blk_mq_map_queue() argument Before the introduction of blk_mq_get_hctx_type(), blk_mq_map_queue() only used the flags from its second argument. Since the introduction of blk_mq_get_hctx_type(), blk_mq_map_queue() uses both the operation and the flags encoded in that argument. Rename the second argument of blk_mq_map_queue() to make this clear. Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220615225549.1054905-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index 2615bd58bad3..e4c6fe2c8ac8 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -86,16 +86,16 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } -static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) +static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ - if (flags & REQ_POLLED) + if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; - else if ((flags & REQ_OP_MASK) == REQ_OP_READ) + else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } @@ -103,14 +103,14 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue - * @flags: request command flags + * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - unsigned int flags, + unsigned int opf, struct blk_mq_ctx *ctx) { - return ctx->hctxs[blk_mq_get_hctx_type(flags)]; + return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* -- cgit v1.2.3 From 51ab80f0aa861335eb80327af53e444a27e824b8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 15 Jun 2022 15:55:49 -0700 Subject: block: Make blk_mq_get_sq_hctx() select the proper hardware queue type Since the introduction of blk_mq_get_hctx_type() the operation type in the second argument of blk_mq_get_hctx_type() matters. The introduction of blk_mq_get_hctx_type() caused blk_mq_get_sq_hctx() to select a hardware queue of type HCTX_TYPE_READ instead of HCTX_TYPE_DEFAULT. 
Switch to hardware queue type HCTX_TYPE_DEFAULT since HCTX_TYPE_READ should only be used for read requests. Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220615225549.1054905-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 93d9d60980fb..fa3dc4f8f35d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2156,7 +2156,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) * just causes lock contention inside the scheduler and pointless cache * bouncing. */ - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx); + struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; -- cgit v1.2.3 From c887519074957f0ebd73b8158c2e0546d97ce0e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 11:09:31 +0200 Subject: block: open code blk_max_size_offset in blk_rq_get_max_sectors blk_rq_get_max_sectors always uses q->limits.chunk_sectors as the chunk_sectors argument, and already checks for max_sectors through the call to blk_queue_get_max_sectors. That means much of blk_max_size_offset is not needed and open coding it simplifies the code. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220614090934.570632-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index db2e03c8af7f..df003ecfbd47 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -566,17 +566,18 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; + unsigned int max_sectors; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; + max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); - - return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); + return max_sectors; + return min(max_sectors, + blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, -- cgit v1.2.3 From 84613beda427b6e4c3d7a9ed2b68efd26300ec63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 11:09:32 +0200 Subject: block: cleanup variable naming in get_max_io_size get_max_io_size has a very odd choice of variables names and initialization patterns. Switch to more descriptive names and more clear initialization of them. 
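Read as a standalone sketch (kernel types assumed; pbs and lbs are the physical and logical block sizes in sectors, both powers of two), the cleaned-up computation is:

    static unsigned int max_io_sectors(unsigned int max_sectors,
                                       sector_t sector,
                                       unsigned int pbs, unsigned int lbs)
    {
            unsigned int start = sector & (pbs - 1);
            unsigned int end = (start + max_sectors) & ~(pbs - 1);

            /* prefer an I/O that ends on a physical block boundary */
            if (end > start)
                    return end - start;
            /* otherwise settle for logical block alignment */
            return max_sectors & ~(lbs - 1);
    }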
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220614090934.570632-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index df003ecfbd47..4da981efddee 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -164,18 +164,16 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { - unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; - unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); - - max_sectors += start_offset; - max_sectors &= ~(pbs - 1); - if (max_sectors > start_offset) - return max_sectors - start_offset; - - return sectors & ~(lbs - 1); + unsigned max_sectors, start, end; + + max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); + start = bio->bi_iter.bi_sector & (pbs - 1); + end = (start + max_sectors) & ~(pbs - 1); + if (end > start) + return end - start; + return max_sectors & ~(lbs - 1); } static inline unsigned get_max_segment_size(const struct request_queue *q, -- cgit v1.2.3 From efef739d5f37dc998b113fb965aea68d42a9eddc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 11:09:33 +0200 Subject: block: fold blk_max_size_offset into get_max_io_size Now that blk_max_size_offset has a single caller left, fold it into that and clean up the naming convention for the local variables there. Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220614090934.570632-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4da981efddee..0f5f42ebd0bb 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -166,9 +166,14 @@ static inline unsigned get_max_io_size(struct request_queue *q, { unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; - unsigned max_sectors, start, end; + unsigned max_sectors = queue_max_sectors(q), start, end; + + if (q->limits.chunk_sectors) { + max_sectors = min(max_sectors, + blk_chunk_sectors_left(bio->bi_iter.bi_sector, + q->limits.chunk_sectors)); + } - max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); start = bio->bi_iter.bi_sector & (pbs - 1); end = (start + max_sectors) & ~(pbs - 1); if (end > start) -- cgit v1.2.3 From 2a9336c42a6abdcef564d522648a684a474a3483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 11:09:34 +0200 Subject: block: move blk_queue_get_max_sectors to blk.h blk_queue_get_max_sectors is private to the block layer, so move it out of blkdev.h. 
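For context, the block-layer internal caller from earlier in this series, blk_rq_get_max_sectors(), combines this per-operation limit with the chunk limit roughly as follows (condensed from the patch above, not a new interface):

    max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
    if (!q->limits.chunk_sectors ||
        req_op(rq) == REQ_OP_DISCARD ||
        req_op(rq) == REQ_OP_SECURE_ERASE)
            return max_sectors;
    return min(max_sectors,
               blk_chunk_sectors_left(offset, q->limits.chunk_sectors));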
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220614090934.570632-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 434017701403..8e79296ee97a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -159,6 +159,19 @@ static inline bool blk_discard_mergable(struct request *req) return false; } +static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, + int op) +{ + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) + return min(q->limits.max_discard_sectors, + UINT_MAX >> SECTOR_SHIFT); + + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + + return q->limits.max_sectors; +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); -- cgit v1.2.3 From 6c77b152f5f1324972cdbdb71e9a6e02d601f49f Mon Sep 17 00:00:00 2001 From: GuoYong Zheng Date: Fri, 17 Jun 2022 18:28:04 +0800 Subject: bfq: Remove useless code in bfq_lookup_next_entity It is no need to judge entity is null or not here, directly return entity is ok, so remove it. Signed-off-by: GuoYong Zheng Link: https://lore.kernel.org/r/1655461684-19075-1-git-send-email-zhenggy@chinatelecom.cn Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index f8eb340381cf..089d07022066 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1472,9 +1472,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, break; } - if (!entity) - return NULL; - return entity; } -- cgit v1.2.3 From c28c49b09e493adf5f79201d6de2d16d9356e9eb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 17 Jun 2022 13:44:33 -0700 Subject: block: bfq: Remove an unused function definition This patch is the result of the analysis of a sparse report. 
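For reference, sparse is run through the kernel build system; a typical invocation that would flag this file is:

    make C=2 block/bfq-cgroup.o

(C=1 checks only files that are being recompiled, C=2 forces the check.)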
Cc: Jan Kara Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220617204433.102022-1-bvanassche@acm.org --- block/bfq-cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 09574af83566..dc0fa93219df 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1471,8 +1471,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) return bfqq->bfqd->root_group; } -void bfqg_and_blkg_get(struct bfq_group *bfqg) {} - void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -- cgit v1.2.3 From 1d87be8212c8c2bb1216a0ba49373e4e0123aaf3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 17 Jun 2022 14:08:59 -0700 Subject: block: bfq: Fix kernel-doc headers Fix the following warnings: block/bfq-cgroup.c:721: warning: Function parameter or member 'bfqg' not described in '__bfq_bic_change_cgroup' block/bfq-cgroup.c:721: warning: Excess function parameter 'blkcg' description in '__bfq_bic_change_cgroup' block/bfq-cgroup.c:870: warning: Function parameter or member 'ioprio_class' not described in 'bfq_reparent_leaf_entity' block/bfq-cgroup.c:900: warning: Function parameter or member 'ioprio_class' not described in 'bfq_reparent_active_queues' Cc: Jan Kara Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220617210859.106623-1-bvanassche@acm.org --- block/bfq-cgroup.c | 6 ++++-- block/bfq-wf2q.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index dc0fa93219df..9fc605791b1e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -706,10 +706,10 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, } /** - * __bfq_bic_change_cgroup - move @bic to @cgroup. + * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. + * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see @@ -863,6 +863,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. + * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, @@ -892,6 +893,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. + * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 089d07022066..983413cdefad 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1360,6 +1360,8 @@ left: /** * __bfq_lookup_next_entity - return the first eligible entity in @st. * @st: the service tree. + * @in_service: whether or not there is an in-service entity for the sched_data + * this active tree belongs to. 
* * If there is no in-service entity for the sched_data st belongs to, * then return the entity that will be set in service if: -- cgit v1.2.3 From 3c8f9da41ed90294d8ca42b3ad8a13c5379bd549 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 22 Jun 2022 10:25:54 +0200 Subject: blk-mq: Don't disable preemption around __blk_mq_run_hw_queue(). __blk_mq_delay_run_hw_queue() disables preemption to get a stable current CPU number and then invokes __blk_mq_run_hw_queue() if the CPU number is part the mask. __blk_mq_run_hw_queue() acquires a spin_lock_t which is a sleeping lock on PREEMPT_RT and can't be acquired with disabled preemption. It is not required for correctness to invoke __blk_mq_run_hw_queue() on a CPU matching hctx->cpumask. Both (async and direct requests) can run on a CPU not matching hctx->cpumask. The CPU mask without disabling preemption and invoking __blk_mq_run_hw_queue(). Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/YrLSEiNvagKJaDs5@linutronix.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fa3dc4f8f35d..62b7025d6854 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2085,14 +2085,10 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); - if (cpumask_test_cpu(cpu, hctx->cpumask)) { + if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); return; } - - put_cpu(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, -- cgit v1.2.3 From e589f46445960c274cc813a1cc8e2fc73b2a1849 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:26 +0200 Subject: block: fix default IO priority handling again Commit e70344c05995 ("block: fix default IO priority handling") introduced an inconsistency in get_current_ioprio() that tasks without IO context return IOPRIO_DEFAULT priority while tasks with freshly allocated IO context will return 0 (IOPRIO_CLASS_NONE/0) IO priority. Tasks without IO context used to be rare before 5a9d041ba2f6 ("block: move io_context creation into where it's needed") but after this commit they became common because now only BFQ IO scheduler setups task's IO context. Similar inconsistency is there for get_task_ioprio() so this inconsistency is now exposed to userspace and userspace will see different IO priority for tasks operating on devices with BFQ compared to devices without BFQ. Furthemore the changes done by commit e70344c05995 change the behavior when no IO priority is set for BFQ IO scheduler which is also documented in ioprio_set(2) manpage: "If no I/O scheduler has been set for a thread, then by default the I/O priority will follow the CPU nice value (setpriority(2)). In Linux kernels before version 2.6.24, once an I/O priority had been set using ioprio_set(), there was no way to reset the I/O scheduling behavior to the default. Since Linux 2.6.24, specifying ioprio as 0 can be used to reset to the default I/O scheduling behavior." So make sure we default to IOPRIO_CLASS_NONE as used to be the case before commit e70344c05995. Also cleanup alloc_io_context() to explicitely set this IO priority for the allocated IO context to avoid future surprises. Note that we tweak ioprio_best() to maintain ioprio_get(2) behavior and make this commit easily backportable. 
CC: stable@vger.kernel.org Fixes: e70344c05995 ("block: fix default IO priority handling") Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 ++ block/ioprio.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index df9cfe4ca532..63fc02042408 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); #endif + ioc->ioprio = IOPRIO_DEFAULT; + return ioc; } diff --git a/block/ioprio.c b/block/ioprio.c index 2fe068fcaad5..2a34cbca18ae 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -157,9 +157,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_DEFAULT; + aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); if (!ioprio_valid(bprio)) - bprio = IOPRIO_DEFAULT; + bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); return min(aprio, bprio); } -- cgit v1.2.3 From 893e5d32d5832674bcf6465f27958e883b72b346 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:28 +0200 Subject: block: Generalize get_current_ioprio() for any task get_current_ioprio() operates only on current task. We will need the same functionality for other tasks as well. Generalize get_current_ioprio() for that and also move the bulk out of the header file because it is large enough. Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/ioprio.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'block') diff --git a/block/ioprio.c b/block/ioprio.c index 2a34cbca18ae..c4e3476155a1 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -138,6 +138,32 @@ out: return ret; } +/* + * If the task has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + * + * Expected to be called for current task or with task_lock() held to keep + * io_context stable. + */ +int __get_task_ioprio(struct task_struct *p) +{ + struct io_context *ioc = p->io_context; + int prio; + + if (p != current) + lockdep_assert_held(&p->alloc_lock); + if (ioc) + prio = ioc->ioprio; + else + prio = IOPRIO_DEFAULT; + + if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) + prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), + task_nice_ioprio(p)); + return prio; +} +EXPORT_SYMBOL_GPL(__get_task_ioprio); + static int get_task_ioprio(struct task_struct *p) { int ret; -- cgit v1.2.3 From fc25545e17bd74befe0b8ab2c65ac84936be5066 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:29 +0200 Subject: block: Make ioprio_best() static Nobody outside of block/ioprio.c uses it. 
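As a quick worked example (lower numeric value means higher priority, so the "best" of two valid priorities is simply the smaller one):

    /* with IOPRIO_CLASS_SHIFT == 13: */
    aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);   /* 2 << 13 == 0x4000 */
    bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); /* 3 << 13 == 0x6000 */
    ioprio_best(aprio, bprio);                       /* == aprio, BE wins */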
Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-4-jack@suse.cz Signed-off-by: Jens Axboe --- block/ioprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/ioprio.c b/block/ioprio.c index c4e3476155a1..8c46f672a0ba 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -180,7 +180,7 @@ out: return ret; } -int ioprio_best(unsigned short aprio, unsigned short bprio) +static int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); -- cgit v1.2.3 From 4b838d9ee950b37bee624e301bd8e923165b1cf3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:30 +0200 Subject: block: Fix handling of tasks without ioprio in ioprio_get(2) ioprio_get(2) can be asked to return the best IO priority from several tasks (IOPRIO_WHO_PGRP, IOPRIO_WHO_USER). Currently the call treats tasks without set IO priority as having priority IOPRIO_CLASS_BE/IOPRIO_BE_NORM however this does not really reflect the IO priority the task will get (which depends on task's nice value). Fix the code to use the real IO priority task's IO will use. We have to modify code for ioprio_get(IOPRIO_WHO_PROCESS) to keep returning IOPRIO_CLASS_NONE priority for tasks without set IO priority as a special case to maintain userspace visible API. Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220623074840.5960-5-jack@suse.cz Signed-off-by: Jens Axboe --- block/ioprio.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/ioprio.c b/block/ioprio.c index 8c46f672a0ba..32a456b45804 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -171,10 +171,31 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_DEFAULT; + task_lock(p); + ret = __get_task_ioprio(p); + task_unlock(p); +out: + return ret; +} + +/* + * Return raw IO priority value as set by userspace. We use this for + * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and + * also so that userspace can distinguish unset IO priority (which just gets + * overriden based on task's nice value) from IO priority set to some value. + */ +static int get_task_raw_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + else + ret = IOPRIO_DEFAULT; task_unlock(p); out: return ret; @@ -182,11 +203,6 @@ out: static int ioprio_best(unsigned short aprio, unsigned short bprio) { - if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); - if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); - return min(aprio, bprio); } @@ -207,7 +223,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else p = find_task_by_vpid(who); if (p) - ret = get_task_ioprio(p); + ret = get_task_raw_ioprio(p); break; case IOPRIO_WHO_PGRP: if (!who) -- cgit v1.2.3 From f25865447294bf2468c2587dd98f8fa999260893 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:31 +0200 Subject: blk-ioprio: Remove unneeded field blkcg->ioprio_set field is not really useful except for avoiding possibly more expensive checks inside blkcg_ioprio_track(). 
The check for blkcg->prio_policy being equal to POLICY_NO_CHANGE does the same service so just remove the ioprio_set field and replace the check. Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-6-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-ioprio.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 79e797f5d194..3f605583598b 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -62,7 +62,6 @@ struct ioprio_blkg { struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; - bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -113,7 +112,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - blkcg->prio_set = true; return nbytes; } @@ -193,16 +191,15 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; - if (!blkcg->prio_set) + if (blkcg->prio_policy == POLICY_NO_CHANGE) return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. - * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the - * bio I/O priority is not modified. If the bio I/O priority equals - * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. + * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O + * priority is assigned to the bio. */ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); -- cgit v1.2.3 From 82b74cac28493fb40ea74fb2fe648b5fc7ea0c1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:32 +0200 Subject: blk-ioprio: Convert from rqos policy to direct call Convert blk-ioprio handling from a rqos policy to a direct call from blk_mq_submit_bio(). Firstly, blk-ioprio is not much of a rqos policy anyway, it just needs a hook in bio submission path to set the bio's IO priority. Secondly, the rqos .track hook gets actually called too late for blk-ioprio purposes and introducing a special rqos hook just for blk-ioprio looks even weirder. 
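The priority adjustment itself is unchanged by the conversion; condensed from blkcg_set_ioprio() in the diff below, it is:

    /* pick the lower priority (numerically larger value) of the bio's
     * own priority and the cgroup policy; IOPRIO_CLASS_NONE (0) always
     * loses the max_t(), so an unset bio inherits the cgroup class */
    prio = max_t(u16, bio->bi_ioprio,
                 IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
    bio->bi_ioprio = prio;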
Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-7-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + block/blk-ioprio.c | 50 +++++--------------------------------------------- block/blk-ioprio.h | 9 +++++++++ block/blk-mq.c | 8 ++++++++ 4 files changed, 23 insertions(+), 45 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 764e740b0c0f..6906981563f8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1299,6 +1299,7 @@ int blkcg_init_queue(struct request_queue *q) ret = blk_iolatency_init(q); if (ret) { blk_throtl_exit(q); + blk_ioprio_exit(q); goto err_destroy_all; } diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 3f605583598b..c00060a02c6e 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -181,17 +181,12 @@ static struct blkcg_policy ioprio_policy = { .pd_free_fn = ioprio_free_pd, }; -struct blk_ioprio { - struct rq_qos rqos; -}; - -static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, - struct bio *bio) +void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; - if (blkcg->prio_policy == POLICY_NO_CHANGE) + if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; /* @@ -207,49 +202,14 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, bio->bi_ioprio = prio; } -static void blkcg_ioprio_exit(struct rq_qos *rqos) +void blk_ioprio_exit(struct request_queue *q) { - struct blk_ioprio *blkioprio_blkg = - container_of(rqos, typeof(*blkioprio_blkg), rqos); - - blkcg_deactivate_policy(rqos->q, &ioprio_policy); - kfree(blkioprio_blkg); + blkcg_deactivate_policy(q, &ioprio_policy); } -static struct rq_qos_ops blkcg_ioprio_ops = { - .track = blkcg_ioprio_track, - .exit = blkcg_ioprio_exit, -}; - int blk_ioprio_init(struct request_queue *q) { - struct blk_ioprio *blkioprio_blkg; - struct rq_qos *rqos; - int ret; - - blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL); - if (!blkioprio_blkg) - return -ENOMEM; - - ret = blkcg_activate_policy(q, &ioprio_policy); - if (ret) { - kfree(blkioprio_blkg); - return ret; - } - - rqos = &blkioprio_blkg->rqos; - rqos->id = RQ_QOS_IOPRIO; - rqos->ops = &blkcg_ioprio_ops; - rqos->q = q; - - /* - * Registering the rq-qos policy after activating the blk-cgroup - * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the - * rq-qos callbacks. 
- */ - rq_qos_add(q, rqos); - - return 0; + return blkcg_activate_policy(q, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index a7785c2f1aea..5a1eb550e178 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -6,14 +6,23 @@ #include struct request_queue; +struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO int blk_ioprio_init(struct request_queue *q); +void blk_ioprio_exit(struct request_queue *q); +void blkcg_set_ioprio(struct bio *bio); #else static inline int blk_ioprio_init(struct request_queue *q) { return 0; } +static inline void blk_ioprio_exit(struct request_queue *q) +{ +} +static inline void blkcg_set_ioprio(struct bio *bio) +{ +} #endif #endif /* _BLK_IOPRIO_H_ */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 62b7025d6854..4c4944b6f520 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -42,6 +42,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); @@ -2779,6 +2780,11 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, return rq; } +static void bio_set_ioprio(struct bio *bio) +{ + blkcg_set_ioprio(bio); +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2819,6 +2825,8 @@ void blk_mq_submit_bio(struct bio *bio) trace_block_getrq(bio); + bio_set_ioprio(bio); + rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); -- cgit v1.2.3 From 9c6227e04355a430aa59709bbf869d9126112d0d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:33 +0200 Subject: block: Initialize bio priority earlier Bio's IO priority needs to be initialized before we try to merge the bio with other bios. Otherwise we could merge bios which would otherwise receive different IO priorities leading to possible QoS issues. Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-8-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c4944b6f520..c0ec1938feee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2814,6 +2814,8 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; + bio_set_ioprio(bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { if (!bio) @@ -2825,8 +2827,6 @@ void blk_mq_submit_bio(struct bio *bio) trace_block_getrq(bio); - bio_set_ioprio(bio); - rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); -- cgit v1.2.3 From a78418e6a04c93b9ffd3f0f601c5cb10612acb7f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:34 +0200 Subject: block: Always initialize bio IO priority on submit Currently, IO priority set in task's IO context is not reflected in the bio->bi_ioprio for most IO (only io_uring and direct IO set it). This results in odd results where process is submitting some bios with one priority and other bios with a different (unset) priority and due to differing priorities bios cannot be merged. Make sure bio->bi_ioprio is always set on bio submission. 
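Together with the previous patch, the submit-path resolution order becomes (condensed from bio_set_ioprio() as changed below):

    if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
            bio->bi_ioprio = get_current_ioprio(); /* task ioprio or nice */
    blkcg_set_ioprio(bio);                         /* cgroup may lower it */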
Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-9-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c0ec1938feee..92aae03103b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2782,6 +2782,9 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, static void bio_set_ioprio(struct bio *bio) { + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } -- cgit v1.2.3 From ee78ec1077d37d1a4a0860589a65df8ae6d2255c Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sat, 25 Jun 2022 23:15:21 +0800 Subject: blk-mq: blk_mq_tag_busy is no need to return a value Currently "blk_mq_tag_busy" return value has no effect, so adjust it. Some code implementations have also been adjusted to enhance readability. Signed-off-by: Liu Song Link: https://lore.kernel.org/r/1656170121-1619-1-git-send-email-liusong@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 18 +++++++----------- block/blk-mq-tag.h | 10 ++++------ 2 files changed, 11 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2dcd738c6952..3cfffef1feb3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -37,29 +37,25 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, * to get tag when first time, the other shared-tag users could reserve * budget for it. */ -bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || - test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { - return true; - } + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + return; + set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags); } else { - if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || - test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { - return true; - } + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state); } users = atomic_inc_return(&hctx->tags->active_queues); blk_mq_update_wake_batch(hctx->tags, users); - - return true; } /* diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5668e28be0b7..91ff37e3b43d 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -47,15 +47,13 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; -extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); -static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) - return false; - - return __blk_mq_tag_busy(hctx); + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3 From 1f90307e5f0d7bc9a336ead528f616a5df8e5944 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:05:49 +0200 Subject: block: remove QUEUE_FLAG_DEAD Disallow setting the blk-mq state on any queue that is already 
dying as setting the state even then is a bad idea, and remove the now unused QUEUE_FLAG_DEAD flag. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20220619060552.1850436-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 3 --- block/blk-mq-debugfs.c | 8 +++----- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c2cec402d01c..f86df390afad 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -313,9 +313,6 @@ void blk_cleanup_queue(struct request_queue *q) * after draining finished. */ blk_freeze_queue(q); - - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); - blk_sync_queue(q); if (queue_is_mq(q)) { blk_mq_cancel_work_sync(q); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4d1ce9ef4318..b80fae7ab1d9 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -116,7 +116,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), QUEUE_FLAG_NAME(SAME_FORCE), - QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), @@ -151,11 +150,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf, char opbuf[16] = { }, *op; /* - * The "state" attribute is removed after blk_cleanup_queue() has called - * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid - * triggering a use-after-free. + * The "state" attribute is removed when the queue is removed. Don't + * allow setting the state on a dying queue to avoid a use-after-free. */ - if (blk_queue_dead(q)) + if (blk_queue_dying(q)) return -ENOENT; if (count >= sizeof(opbuf)) { -- cgit v1.2.3 From 0e3534022f26ae51f7cf28347a253230604b6f4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:05:50 +0200 Subject: block: stop setting the nomerges flags in blk_cleanup_queue These flags only apply to file system I/O, and all file system I/O is already drained by del_gendisk and thus can't be in progress when blk_cleanup_queue is called. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20220619060552.1850436-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f86df390afad..04029ffea031 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -304,9 +304,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - /* * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that blk_mq_run_hw_queues() accesses the hardware queues -- cgit v1.2.3 From 6f8191fdf41d3a53cc1d63fe2234e812c55a0092 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:05:51 +0200 Subject: block: simplify disk shutdown Set the queue dying flag and call blk_mq_exit_queue from del_gendisk for all disks that do not have separately allocated queues, and thus remove the need to call blk_cleanup_queue for them. Rename blk_cleanup_queue to blk_mq_destroy_queue to make it clear that this function is intended only for separately allocated blk-mq queues. This saves an extra queue freeze for devices without a separately allocated queue. 
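For a driver that still allocates its queue separately (bsg-lib, converted below, is the in-tree example), the teardown sequence after this change looks roughly like the following sketch; the exdrv structure is hypothetical:

static void exdrv_remove(struct exdrv_device *edev)
{
	/* drains the queue, fails future I/O, drops the initial ref */
	blk_mq_destroy_queue(edev->queue);	/* was blk_cleanup_queue() */
	blk_mq_free_tag_set(&edev->tag_set);
	kfree(edev);
}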
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20220619060552.1850436-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 37 ------------------------------------- block/blk-mq.c | 43 +++++++++++++++++++++++++++++++++++++++++-- block/blk-sysfs.c | 5 ----- block/blk.h | 3 +++ block/bsg-lib.c | 4 ++-- block/genhd.c | 23 +++++++++++++---------- 6 files changed, 59 insertions(+), 56 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 04029ffea031..5ad7bd93077c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -284,43 +284,6 @@ void blk_queue_start_drain(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } -/** - * blk_cleanup_queue - shutdown a request queue - * @q: request queue to shutdown - * - * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and - * put it. All future requests will be failed immediately with -ENODEV. - * - * Context: can sleep - */ -void blk_cleanup_queue(struct request_queue *q) -{ - /* cannot be called from atomic context */ - might_sleep(); - - WARN_ON_ONCE(blk_queue_registered(q)); - - /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - blk_queue_start_drain(q); - - /* - * Drain all requests queued before DYING marking. Set DEAD flag to - * prevent that blk_mq_run_hw_queues() accesses the hardware queues - * after draining finished. - */ - blk_freeze_queue(q); - blk_sync_queue(q); - if (queue_is_mq(q)) { - blk_mq_cancel_work_sync(q); - blk_mq_exit_queue(q); - } - - /* @q is and will stay empty, shutdown and put */ - blk_put_queue(q); -} -EXPORT_SYMBOL(blk_cleanup_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer diff --git a/block/blk-mq.c b/block/blk-mq.c index 92aae03103b7..b1dbc4b2c2c9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3902,7 +3902,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { - blk_cleanup_queue(q); + blk_put_queue(q); return ERR_PTR(ret); } return q; @@ -3914,6 +3914,35 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +/** + * blk_mq_destroy_queue - shutdown a request queue + * @q: request queue to shutdown + * + * This shuts down a request queue allocated by blk_mq_init_queue() and drops + * the initial reference. All future requests will fail with -ENODEV. 
+ * + * Context: can sleep + */ +void blk_mq_destroy_queue(struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_mq(q)); + WARN_ON_ONCE(blk_queue_registered(q)); + + might_sleep(); + + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); + blk_freeze_queue(q); + + blk_sync_queue(q); + blk_mq_cancel_work_sync(q); + blk_mq_exit_queue(q); + + /* @q is and will stay empty, shutdown and put */ + blk_put_queue(q); +} +EXPORT_SYMBOL(blk_mq_destroy_queue); + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass) { @@ -3926,13 +3955,23 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { - blk_cleanup_queue(q); + blk_put_queue(q); return ERR_PTR(-ENOMEM); } + set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); +struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, + struct lock_class_key *lkclass) +{ + if (!blk_get_queue(q)) + return NULL; + return __alloc_disk_node(q, NUMA_NO_NODE, lkclass); +} +EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 69e53d1a4f0e..9b211e519de8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -755,11 +755,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * decremented with blk_put_queue(). Once the refcount reaches 0 this function * is called. * - * For drivers that have a request_queue on a gendisk and added with - * __device_add_disk() the refcount to request_queue will reach 0 with - * the last put_disk() called by the driver. For drivers which don't use - * __device_add_disk() this happens with blk_cleanup_queue(). - * * Drivers exist which depend on the release of the request_queue to be * synchronous, it should not be deferred. * diff --git a/block/blk.h b/block/blk.h index 8e79296ee97a..1a0d3e6a4a63 100644 --- a/block/blk.h +++ b/block/blk.h @@ -424,6 +424,9 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); void blk_drop_partitions(struct gendisk *disk); +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); + int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index acfe1357bf6c..fd4cd5e68282 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -324,7 +324,7 @@ void bsg_remove_queue(struct request_queue *q) container_of(q->tag_set, struct bsg_set, tag_set); bsg_unregister_queue(bset->bd); - blk_cleanup_queue(q); + blk_mq_destroy_queue(q); blk_mq_free_tag_set(&bset->tag_set); kfree(bset); } @@ -399,7 +399,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, return q; out_cleanup_queue: - blk_cleanup_queue(q); + blk_mq_destroy_queue(q); out_queue: blk_mq_free_tag_set(set); out_tag_set: diff --git a/block/genhd.c b/block/genhd.c index 278227ba1d53..4d15f828c449 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -617,6 +617,8 @@ void del_gendisk(struct gendisk *disk) * Fail any new I/O. 
*/ set_bit(GD_DEAD, &disk->state); + if (test_bit(GD_OWNS_QUEUE, &disk->state)) + blk_queue_flag_set(QUEUE_FLAG_DYING, q); set_capacity(disk, 0); /* @@ -663,11 +665,16 @@ void del_gendisk(struct gendisk *disk) blk_mq_unquiesce_queue(q); /* - * Allow using passthrough request again after the queue is torn down. + * If the disk does not own the queue, allow using passthrough requests + * again. Else leave the queue frozen to fail all I/O. */ - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); - __blk_mq_unfreeze_queue(q, true); - + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } else { + if (queue_is_mq(q)) + blk_mq_exit_queue(q); + } } EXPORT_SYMBOL(del_gendisk); @@ -1338,9 +1345,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, { struct gendisk *disk; - if (!blk_get_queue(q)) - return NULL; - disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) goto out_put_queue; @@ -1391,7 +1395,6 @@ out_put_queue: blk_put_queue(q); return NULL; } -EXPORT_SYMBOL(__alloc_disk_node); struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { @@ -1404,9 +1407,10 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) disk = __alloc_disk_node(q, node, lkclass); if (!disk) { - blk_cleanup_queue(q); + blk_put_queue(q); return NULL; } + set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_alloc_disk); @@ -1439,7 +1443,6 @@ EXPORT_SYMBOL(put_disk); */ void blk_cleanup_disk(struct gendisk *disk) { - blk_cleanup_queue(disk->queue); put_disk(disk); } EXPORT_SYMBOL(blk_cleanup_disk); -- cgit v1.2.3 From 8b9ab62662048a3274361c7e5f64037c2c133e2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:05:52 +0200 Subject: block: remove blk_cleanup_disk blk_cleanup_disk is nothing but a trivial wrapper for put_disk now, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20220619060552.1850436-7-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4d15f828c449..bf9be06af2c8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1432,21 +1432,6 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/** - * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk - * @disk: gendisk to shutdown - * - * Mark the queue hanging off @disk DYING, drain all pending requests, then mark - * the queue DEAD, destroy and put it and the gendisk structure. - * - * Context: can sleep - */ -void blk_cleanup_disk(struct gendisk *disk) -{ - put_disk(disk); -} -EXPORT_SYMBOL(blk_cleanup_disk); - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; -- cgit v1.2.3 From cc5c516df028b221d94c65c47c5ae8d20f61b6f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:45 +0200 Subject: block: simplify blktrace sysfs attribute creation Add the trace attributes to the default gendisk attributes, just like we already do for partitions. 
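The mechanism this builds on, sketched here with hypothetical names: any attribute_group listed in a device's default groups is created by device_add() and removed by device_del(), so no explicit create/remove calls are needed:

static struct attribute *exdrv_attrs[] = {
	&dev_attr_example.attr,		/* assumes DEVICE_ATTR_RO(example) */
	NULL,
};

static const struct attribute_group exdrv_attr_group = {
	.attrs = exdrv_attrs,
};

/* listed as a default group: files appear and disappear automatically */
static const struct attribute_group *exdrv_attr_groups[] = {
	&exdrv_attr_group,
	NULL,
};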
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 11 +---------- block/blk.h | 2 ++ block/genhd.c | 3 +++ block/partitions/core.c | 1 - 4 files changed, 6 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9b211e519de8..5f3f73115988 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -810,21 +810,14 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - ret = blk_trace_init_sysfs(dev); - if (ret) - return ret; - mutex_lock(&q->sysfs_dir_lock); ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) { - blk_trace_remove_sysfs(dev); + if (ret < 0) goto unlock; - } ret = sysfs_create_group(&q->kobj, &queue_attr_group); if (ret) { - blk_trace_remove_sysfs(dev); kobject_del(&q->kobj); kobject_put(&dev->kobj); goto unlock; @@ -890,7 +883,6 @@ put_dev: mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; @@ -931,7 +923,6 @@ void blk_unregister_queue(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); blk_crypto_sysfs_unregister(q); - blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); diff --git a/block/blk.h b/block/blk.h index 1a0d3e6a4a63..74d59435870c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -452,6 +452,8 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; +extern struct attribute_group blk_trace_attr_group; + long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/genhd.c b/block/genhd.c index bf9be06af2c8..b1fb7e058b9c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1134,6 +1134,9 @@ static struct attribute_group disk_attr_group = { static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/block/partitions/core.c b/block/partitions/core.c index 8a0ec929023b..7dc487f5b03c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include "check.h" -- cgit v1.2.3 From 060f131e9c438837f9792e456fae424e621fb881 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:46 +0200 Subject: block: remove a superflous queue kobject reference kobject_add already adds a reference to the parent that is dropped on deletion, so don't bother grabbing another one. 
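The refcounting contract relied on here, in a minimal hypothetical sketch: kobject_add() takes its own reference on the parent, and that reference is dropped again when the child kobject is deleted, so a caller-side get/put pair around it is redundant:

ret = kobject_add(&child->kobj, &parent->kobj, "child");
/* kobject_add() already took a reference on parent->kobj */

kobject_del(&child->kobj);	/* the parent reference is dropped here */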
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5f3f73115988..f9373da591b8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -812,14 +812,13 @@ int blk_register_queue(struct gendisk *disk) mutex_lock(&q->sysfs_dir_lock); - ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); + ret = kobject_add(&q->kobj, &dev->kobj, "%s", "queue"); if (ret < 0) goto unlock; ret = sysfs_create_group(&q->kobj, &queue_attr_group); if (ret) { kobject_del(&q->kobj); - kobject_put(&dev->kobj); goto unlock; } @@ -883,7 +882,6 @@ put_dev: mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); - kobject_put(&dev->kobj); return ret; } @@ -941,6 +939,4 @@ void blk_unregister_queue(struct gendisk *disk) q->sched_debugfs_dir = NULL; q->rqos_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); - - kobject_put(&disk_to_dev(disk)->kobj); } -- cgit v1.2.3 From 4a8d14bba486cca6880062f1ef240cf1d45f3367 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:47 +0200 Subject: block: use default groups to register the queue attributes Set up the default_groups for blk_queue_ktype instead of manually calling sysfs_create_group. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f9373da591b8..b72506770b97 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -795,7 +795,13 @@ static const struct sysfs_ops queue_sysfs_ops = { .store = queue_attr_store, }; +static const struct attribute_group *blk_queue_attr_groups[] = { + &queue_attr_group, + NULL +}; + struct kobj_type blk_queue_ktype = { + .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_release_queue, }; @@ -816,12 +822,6 @@ int blk_register_queue(struct gendisk *disk) if (ret < 0) goto unlock; - ret = sysfs_create_group(&q->kobj, &queue_attr_group); - if (ret) { - kobject_del(&q->kobj); - goto unlock; - } - if (queue_is_mq(q)) __blk_mq_register_dev(dev, q); mutex_lock(&q->sysfs_lock); -- cgit v1.2.3 From 81f0c2ef41b02185928563899cd4d618ffc7eebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:48 +0200 Subject: block: remove the extra gendisk reference in __blk_mq_register_dev kobject_add already grabs a reference to the parent, no need to have another one. 
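The blk_queue_ktype conversion above is an instance of the same idea at the kobj_type level; a generic sketch with hypothetical names:

static struct kobj_type exdrv_ktype = {
	.sysfs_ops	= &exdrv_sysfs_ops,	/* assumed show/store ops */
	.release	= exdrv_release,
	/* groups created by kobject_add(), removed by kobject_del() */
	.default_groups	= exdrv_groups,
};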
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index c08426975856..f4caaa668e3c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -215,7 +215,6 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); - kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } @@ -261,7 +260,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); - ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + ret = kobject_add(q->mq_kobj, &dev->kobj, "%s", "mq"); if (ret < 0) goto out; @@ -286,7 +285,6 @@ unreg: kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); - kobject_put(&dev->kobj); return ret; } -- cgit v1.2.3 From eaa870f97544668025ba1f96ee267abac7b3c73c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:49 +0200 Subject: blk-mq: rename blk_mq_sysfs_{,un}register Add a _hctx postfix to better describe what the functions do, match the debugfs equivalents and release the old names for functions that should be using them. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 4 ++-- block/blk-mq.c | 4 ++-- block/blk-mq.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index f4caaa668e3c..ee6efe2b250d 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -288,7 +288,7 @@ unreg: return ret; } -void blk_mq_sysfs_unregister(struct request_queue *q) +void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; @@ -304,7 +304,7 @@ unlock: mutex_unlock(&q->sysfs_dir_lock); } -int blk_mq_sysfs_register(struct request_queue *q) +int blk_mq_sysfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; diff --git a/block/blk-mq.c b/block/blk-mq.c index b1dbc4b2c2c9..15c7c5c4ad22 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4559,7 +4559,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister(q); + blk_mq_sysfs_unregister_hctxs(q); } prev_nr_hw_queues = set->nr_hw_queues; @@ -4590,7 +4590,7 @@ fallback: reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_sysfs_register(q); + blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); } diff --git a/block/blk-mq.h b/block/blk-mq.h index e4c6fe2c8ac8..a92639f2bfd2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -119,8 +119,8 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); -extern int blk_mq_sysfs_register(struct request_queue *q); -extern void blk_mq_sysfs_unregister(struct request_queue *q); +int blk_mq_sysfs_register_hctxs(struct request_queue *q); +void 
blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -- cgit v1.2.3 From 8682b92e5ab852b93739a0f2b261fff4c733be57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jun 2022 19:18:50 +0200 Subject: blk-mq: cleanup disk sysfs registration Pass a gendisk to the sysfs register/unregister functions and give them descriptive names. Also move the unregistration helper next to the one doing the registration. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20220628171850.1313069-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 39 ++++++++++++++++++++------------------- block/blk-mq.h | 3 ++- block/blk-sysfs.c | 9 ++++----- 3 files changed, 26 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ee6efe2b250d..93997d297d42 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -203,22 +203,6 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - struct blk_mq_hw_ctx *hctx; - unsigned long i; - - lockdep_assert_held(&q->sysfs_dir_lock); - - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_unregister_hctx(hctx); - - kobject_uevent(q->mq_kobj, KOBJ_REMOVE); - kobject_del(q->mq_kobj); - - q->mq_sysfs_init_done = false; -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); @@ -251,16 +235,16 @@ void blk_mq_sysfs_init(struct request_queue *q) } } -int __blk_mq_register_dev(struct device *dev, struct request_queue *q) +int blk_mq_sysfs_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; unsigned long i, j; int ret; - WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); - ret = kobject_add(q->mq_kobj, &dev->kobj, "%s", "mq"); + ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) goto out; @@ -288,6 +272,23 @@ unreg: return ret; } +void blk_mq_sysfs_unregister(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + lockdep_assert_held(&q->sysfs_dir_lock); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); + + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); + + q->mq_sysfs_init_done = false; +} + void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index a92639f2bfd2..54e20edf0da3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -118,7 +118,8 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); -extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); +int blk_mq_sysfs_register(struct gendisk *disk); +void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b72506770b97..85ea43eff094 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -812,18 +812,17 @@ struct 
kobj_type blk_queue_ktype = { */ int blk_register_queue(struct gendisk *disk) { - int ret; - struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; + int ret; mutex_lock(&q->sysfs_dir_lock); - ret = kobject_add(&q->kobj, &dev->kobj, "%s", "queue"); + ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) goto unlock; if (queue_is_mq(q)) - __blk_mq_register_dev(dev, q); + blk_mq_sysfs_register(disk); mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); @@ -919,7 +918,7 @@ void blk_unregister_queue(struct gendisk *disk) * structures that can be modified through sysfs. */ if (queue_is_mq(q)) - blk_mq_unregister_dev(disk_to_dev(disk), q); + blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(q); mutex_lock(&q->sysfs_lock); -- cgit v1.2.3 From b9a1c179bdfa133d28ab8b7d30631b0accdc2057 Mon Sep 17 00:00:00 2001 From: Ying Sun Date: Wed, 29 Jun 2022 14:24:09 +0800 Subject: block: remove "select BLK_RQ_IO_DATA_LEN" from BLK_CGROUP_IOCOST dependency The configuration item BLK_RQ_IO_DATA_LEN is not declared in the kernel. Select BLK_RQ_IO_DATA_LEN is meaningless which could be removed. Signed-off-by: Ying Sun Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220629062409.19458-1-sunying@nj.iscas.ac.cn Signed-off-by: Jens Axboe --- block/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 50b17e260fa2..444c5ab3b67e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -147,7 +147,6 @@ config BLK_CGROUP_FC_APPID config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" depends on BLK_CGROUP - select BLK_RQ_IO_DATA_LEN select BLK_RQ_ALLOC_TIME help Enabling this option enables the .weight interface for cost -- cgit v1.2.3 From 6a27d28c81bc5843de2490688a04ee5baa6615e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jun 2022 08:20:12 +0200 Subject: block: move ->ia_ranges from the request_queue to the gendisk Independent access ranges only matter for file system I/O and are only valid with a registered gendisk, so move them there. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20220629062013.1331068-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ia-ranges.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 47c89e65b57f..c1bf14bcd15f 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -106,7 +106,7 @@ static struct kobj_type blk_ia_ranges_ktype = { * * Register with sysfs a set of independent access ranges for @disk. * If @new_iars is not NULL, this set of ranges is registered and the old set - * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is + * specified by disk->ia_ranges is unregistered. Otherwise, disk->ia_ranges is * registered if it is not already. 
*/ int disk_register_independent_access_ranges(struct gendisk *disk, @@ -121,12 +121,12 @@ int disk_register_independent_access_ranges(struct gendisk *disk, /* If a new range set is specified, unregister the old one */ if (new_iars) { - if (q->ia_ranges) + if (disk->ia_ranges) disk_unregister_independent_access_ranges(disk); - q->ia_ranges = new_iars; + disk->ia_ranges = new_iars; } - iars = q->ia_ranges; + iars = disk->ia_ranges; if (!iars) return 0; @@ -138,7 +138,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk, ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, &q->kobj, "%s", "independent_access_ranges"); if (ret) { - q->ia_ranges = NULL; + disk->ia_ranges = NULL; kobject_put(&iars->kobj); return ret; } @@ -164,7 +164,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk, void disk_unregister_independent_access_ranges(struct gendisk *disk) { struct request_queue *q = disk->queue; - struct blk_independent_access_ranges *iars = q->ia_ranges; + struct blk_independent_access_ranges *iars = disk->ia_ranges; int i; lockdep_assert_held(&q->sysfs_dir_lock); @@ -182,7 +182,7 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk) kfree(iars); } - q->ia_ranges = NULL; + disk->ia_ranges = NULL; } static struct blk_independent_access_range * @@ -242,7 +242,7 @@ static bool disk_check_ia_ranges(struct gendisk *disk, static bool disk_ia_ranges_changed(struct gendisk *disk, struct blk_independent_access_ranges *new) { - struct blk_independent_access_ranges *old = disk->queue->ia_ranges; + struct blk_independent_access_ranges *old = disk->ia_ranges; int i; if (!old) @@ -331,7 +331,7 @@ reg: if (blk_queue_registered(q)) { disk_register_independent_access_ranges(disk, iars); } else { - swap(q->ia_ranges, iars); + swap(disk->ia_ranges, iars); kfree(iars); } -- cgit v1.2.3 From 22d0c4080fe49299640d9d6c43154c49794c2825 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jun 2022 08:20:13 +0200 Subject: block: simplify disk_set_independent_access_ranges Lift setting disk->ia_ranges from disk_register_independent_access_ranges into disk_set_independent_access_ranges, and make the behavior the same for the registered vs non-registered queue cases. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20220629062013.1331068-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ia-ranges.c | 57 +++++++++++++++------------------------------------ block/blk-sysfs.c | 2 +- block/blk.h | 3 +-- 3 files changed, 18 insertions(+), 44 deletions(-) (limited to 'block') diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index c1bf14bcd15f..2bd1d311033b 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -102,31 +102,18 @@ static struct kobj_type blk_ia_ranges_ktype = { * disk_register_independent_access_ranges - register with sysfs a set of * independent access ranges * @disk: Target disk - * @new_iars: New set of independent access ranges * * Register with sysfs a set of independent access ranges for @disk. - * If @new_iars is not NULL, this set of ranges is registered and the old set - * specified by disk->ia_ranges is unregistered. Otherwise, disk->ia_ranges is - * registered if it is not already. 
*/ -int disk_register_independent_access_ranges(struct gendisk *disk, - struct blk_independent_access_ranges *new_iars) +int disk_register_independent_access_ranges(struct gendisk *disk) { + struct blk_independent_access_ranges *iars = disk->ia_ranges; struct request_queue *q = disk->queue; - struct blk_independent_access_ranges *iars; int i, ret; lockdep_assert_held(&q->sysfs_dir_lock); lockdep_assert_held(&q->sysfs_lock); - /* If a new range set is specified, unregister the old one */ - if (new_iars) { - if (disk->ia_ranges) - disk_unregister_independent_access_ranges(disk); - disk->ia_ranges = new_iars; - } - - iars = disk->ia_ranges; if (!iars) return 0; @@ -210,6 +197,9 @@ static bool disk_check_ia_ranges(struct gendisk *disk, sector_t sector = 0; int i; + if (WARN_ON_ONCE(!iars->nr_ia_ranges)) + return false; + /* * While sorting the ranges in increasing LBA order, check that the * ranges do not overlap, that there are no sector holes and that all @@ -298,25 +288,15 @@ void disk_set_independent_access_ranges(struct gendisk *disk, { struct request_queue *q = disk->queue; - if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) { + mutex_lock(&q->sysfs_dir_lock); + mutex_lock(&q->sysfs_lock); + if (iars && !disk_check_ia_ranges(disk, iars)) { kfree(iars); iars = NULL; } - - mutex_lock(&q->sysfs_dir_lock); - mutex_lock(&q->sysfs_lock); - - if (iars) { - if (!disk_check_ia_ranges(disk, iars)) { - kfree(iars); - iars = NULL; - goto reg; - } - - if (!disk_ia_ranges_changed(disk, iars)) { - kfree(iars); - goto unlock; - } + if (iars && !disk_ia_ranges_changed(disk, iars)) { + kfree(iars); + goto unlock; } /* @@ -324,17 +304,12 @@ void disk_set_independent_access_ranges(struct gendisk *disk, * revalidation. If that is the case, we need to unregister the old * set of independent access ranges and register the new set. If the * queue is not registered, registration of the device request queue - * will register the independent access ranges, so only swap in the - * new set and free the old one. + * will register the independent access ranges. 
*/ -reg: - if (blk_queue_registered(q)) { - disk_register_independent_access_ranges(disk, iars); - } else { - swap(disk->ia_ranges, iars); - kfree(iars); - } - + disk_unregister_independent_access_ranges(disk); + disk->ia_ranges = iars; + if (blk_queue_registered(q)) + disk_register_independent_access_ranges(disk); unlock: mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 85ea43eff094..58cb9cb9f48c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -832,7 +832,7 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); - ret = disk_register_independent_access_ranges(disk, NULL); + ret = disk_register_independent_access_ranges(disk); if (ret) goto put_dev; diff --git a/block/blk.h b/block/blk.h index 74d59435870c..58ad50cacd2d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -459,8 +459,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; -int disk_register_independent_access_ranges(struct gendisk *disk, - struct blk_independent_access_ranges *new_iars); +int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); #ifdef CONFIG_FAIL_MAKE_REQUEST -- cgit v1.2.3 From 362b8c16f8fc73fddfe4bded25055fa0c9e2bf1e Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 29 Jun 2022 15:09:16 +0800 Subject: blk-cgroup: factor out blkcg_iostat_update() To reduce some duplicated code, factor out blkcg_iostat_update(). No functional change. Signed-off-by: Jason Yan Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220629070917.3113016-2-yanaijie@huawei.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6906981563f8..34a9452f93a7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -846,6 +846,21 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) } } +static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, + struct blkg_iostat *last) +{ + struct blkg_iostat delta; + unsigned long flags; + + /* propagate percpu delta to global */ + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&delta, cur); + blkg_iostat_sub(&delta, last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(last, &delta); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); +} + static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct blkcg *blkcg = css_to_blkcg(css); @@ -860,8 +875,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { struct blkcg_gq *parent = blkg->parent; struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned long flags; + struct blkg_iostat cur; unsigned int seq; /* fetch the current per-cpu values */ @@ -870,23 +884,12 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); - /* propagate percpu delta to global */ - flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - 
blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); + blkcg_iostat_update(blkg, &cur, &bisc->last); /* propagate global delta to parent (unless that's root) */ - if (parent && parent->parent) { - flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); - } + if (parent && parent->parent) + blkcg_iostat_update(parent, &blkg->iostat.cur, + &blkg->iostat.last); } rcu_read_unlock(); -- cgit v1.2.3 From e55cf798140518b900e5254093f1195f65c23026 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 29 Jun 2022 15:09:17 +0800 Subject: blk-cgroup: factor out blkcg_free_all_cpd() To reduce some duplicated code, factor out blkcg_free_all_cpd(). No functional change. Signed-off-by: Jason Yan Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220629070917.3113016-3-yanaijie@huawei.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 34a9452f93a7..27a2d0ca0c70 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1533,6 +1533,18 @@ void blkcg_deactivate_policy(struct request_queue *q, } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); +static void blkcg_free_all_cpd(struct blkcg_policy *pol) +{ + struct blkcg *blkcg; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } +} + /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register @@ -1597,14 +1609,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); @@ -1621,8 +1628,6 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - struct blkcg *blkcg; - mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) @@ -1637,14 +1642,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); -- cgit v1.2.3 From 99e48cd6855e9535488e3c90d65edd46c6e6fc1b Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jul 2022 20:03:50 +0800 Subject: blk-mq: Add a flag for reserved requests Add a flag for reserved requests so that drivers may know this for any special handling. Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/1657109034-206040-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 15c7c5c4ad22..a00e43cc67e5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -475,6 +475,9 @@ retry: if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); + if (data->flags & BLK_MQ_REQ_RESERVED) + data->rq_flags |= RQF_RESV; + /* * Try batched alloc if we want more than 1 tag. */ @@ -589,6 +592,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, else data.rq_flags |= RQF_ELV; + if (flags & BLK_MQ_REQ_RESERVED) + data.rq_flags |= RQF_RESV; + ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) -- cgit v1.2.3 From 9bdb4833dd399cbff82cc20893f52bdec66a9eca Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jul 2022 20:03:51 +0800 Subject: blk-mq: Drop blk_mq_ops.timeout 'reserved' arg With new API blk_mq_is_reserved_rq() we can tell if a request is from the reserved pool, so stop passing 'reserved' arg. There is actually only a single user of that arg for all the callback implementations, which can use blk_mq_is_reserved_rq() instead. This will also allow us to stop passing the same 'reserved' around the blk-mq iter functions next. Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Acked-by: Ulf Hansson # For MMC Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/1657109034-206040-4-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- block/bsg-lib.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a00e43cc67e5..cedbec36e907 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1427,13 +1427,13 @@ bool blk_mq_queue_inflight(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); -static void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; - ret = req->q->mq_ops->timeout(req, reserved); + ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); @@ -1482,7 +1482,7 @@ static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, next)) - blk_mq_rq_timed_out(rq, reserved); + blk_mq_rq_timed_out(rq); return true; } diff --git a/block/bsg-lib.c b/block/bsg-lib.c index fd4cd5e68282..d6f5dcdce748 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -331,7 +331,7 @@ void bsg_remove_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(bsg_remove_queue); -static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) +static enum blk_eh_timer_return bsg_timeout(struct request *rq) { struct bsg_set *bset = container_of(rq->q->tag_set, struct bsg_set, tag_set); -- cgit v1.2.3 From 2dd6532e9591f201e7571b30915db807603ab924 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jul 2022 20:03:53 +0800 Subject: blk-mq: Drop 'reserved' arg of busy_tag_iter_fn We no longer use the 'reserved' arg in busy_tag_iter_fn for any iter function so it may be dropped. Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Reviewed-by: Sagi Grimberg #nvme Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/1657109034-206040-6-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq-tag.c | 7 +++---- block/blk-mq.c | 10 ++++------ 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b80fae7ab1d9..b11add9a95e2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -375,7 +375,7 @@ struct show_busy_params { * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to * keep iterating requests. */ -static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +static bool hctx_show_busy_rq(struct request *rq, void *data) { const struct show_busy_params *params = data; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 3cfffef1feb3..4e9b8ec55bda 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -283,7 +283,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) - ret = iter_data->fn(rq, iter_data->data, reserved); + ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } @@ -354,7 +354,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) - ret = iter_data->fn(rq, iter_data->data, reserved); + ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; @@ -444,8 +444,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -static bool blk_mq_tagset_count_completed_rqs(struct request *rq, - void *data, bool reserved) +static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; diff --git a/block/blk-mq.c b/block/blk-mq.c index cedbec36e907..63385742b8a8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -129,8 +129,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct request *rq, void *priv, - bool reserved) +static bool blk_mq_check_inflight(struct request *rq, void *priv) { struct mq_inflight *mi = priv; @@ -1400,8 +1399,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -static bool blk_mq_rq_inflight(struct request *rq, void *priv, - bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy @@ -1470,7 +1468,7 @@ void blk_mq_put_rq_ref(struct request *rq) __blk_mq_free_request(rq); } -static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) +static bool blk_mq_check_expired(struct request *rq, void *priv) { unsigned long *next = priv; @@ -3289,7 +3287,7 @@ struct rq_iter_data { bool has_rq; }; -static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; -- cgit v1.2.3 From 4cf6e6c0106bf6e6d034fa6043b4428ac2f267fc Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 6 Jul 2022 20:03:54 +0800 Subject: blk-mq: Drop local variable for reserved tag The local variable is now only referenced once so drop it. Signed-off-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/1657109034-206040-7-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 4e9b8ec55bda..8e3b36d1cb57 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -262,7 +262,6 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; - bool reserved = iter_data->reserved; struct blk_mq_tags *tags; struct request *rq; bool ret = true; @@ -272,7 +271,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) else tags = hctx->tags; - if (!reserved) + if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions @@ -333,12 +332,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; - bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); - if (!reserved) + if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* -- cgit v1.2.3 From 6cc37a672a1e21245b931722a016b3bd4ae10e2d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:36 +0200 Subject: block: call blk_queue_free_zone_bitmaps from disk_release The zone bitmaps are only used for non-passthrough I/O, so free them as soon as the disk is released. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 -- block/genhd.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 58cb9cb9f48c..7590810cf13f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -776,8 +776,6 @@ static void blk_release_queue(struct kobject *kobj) blk_free_queue_stats(q->stats); kfree(q->poll_stat); - blk_queue_free_zone_bitmaps(q); - if (queue_is_mq(q)) blk_mq_release(q); diff --git a/block/genhd.c b/block/genhd.c index b1fb7e058b9c..d0bdeb93e922 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1165,6 +1165,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); + blk_queue_free_zone_bitmaps(disk->queue); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; -- cgit v1.2.3 From edd1dbc83b1de3b98590b76e09b86ddf6887fce7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:37 +0200 Subject: block: use bdev_is_zoned instead of open coding it Use bdev_is_zoned in all places where a block_device is available instead of open coding it. 
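The conversion is mechanical; schematically (not a hunk from the patch):

/* before: open-coded through the request_queue */
if (!blk_queue_is_zoned(bdev_get_queue(bdev)))
	return -EOPNOTSUPP;

/* after: ask the block_device directly */
if (!bdev_is_zoned(bdev))
	return -EOPNOTSUPP;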
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-core.c | 6 +++--- block/blk-mq.h | 2 +- block/blk-zoned.c | 9 ++++----- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 933ea3210954..888ee81ea303 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1033,7 +1033,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) return 0; - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev))) return 0; return bio_add_hw_page(q, bio, page, len, offset, diff --git a/block/blk-core.c b/block/blk-core.c index 5ad7bd93077c..6bcca0b686de 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -569,7 +569,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ @@ -775,11 +775,11 @@ void submit_bio_noacct(struct bio *bio) case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_ZONE_RESET_ALL: - if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) + if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: diff --git a/block/blk-mq.h b/block/blk-mq.h index 54e20edf0da3..31d75a83a562 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -317,7 +317,7 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, * For regular block devices or read operations, use the context plug * which may be NULL if blk_start_plug() was not executed. 
*/ - if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) + if (!bdev_is_zoned(bio->bi_bdev) || !op_is_write(bio_op(bio))) return current->plug; /* Zoned block device write operation case: do not plug the BIO */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 38cd840d8838..90a5c9cc80ab 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -149,8 +149,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); - if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || - WARN_ON_ONCE(!disk->fops->report_zones)) + if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; if (!nr_zones || sector >= capacity) @@ -268,7 +267,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, struct bio *bio = NULL; int ret = 0; - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -EOPNOTSUPP; if (bdev_read_only(bdev)) @@ -350,7 +349,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!q) return -ENXIO; - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -ENOTTY; if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) @@ -408,7 +407,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!q) return -ENXIO; - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -ENOTTY; if (!(mode & FMODE_WRITE)) -- cgit v1.2.3 From 6deacb3bfac2b720e707c566549a7041f17db9c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:38 +0200 Subject: block: simplify blk_mq_plug Drop the unused q argument, and invert the check to move the exception into a branch and the regular path as the normal return. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 2 +- block/blk-mq.c | 2 +- block/blk-mq.h | 18 ++++++++---------- 4 files changed, 11 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6bcca0b686de..bc16e9bae2dc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -719,7 +719,7 @@ void submit_bio_noacct(struct bio *bio) might_sleep(); - plug = blk_mq_plug(q, bio); + plug = blk_mq_plug(bio); if (plug && plug->nowait) bio->bi_opf |= REQ_NOWAIT; diff --git a/block/blk-merge.c b/block/blk-merge.c index 0f5f42ebd0bb..5abf5aa5a5f0 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1051,7 +1051,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, struct blk_plug *plug; struct request *rq; - plug = blk_mq_plug(q, bio); + plug = blk_mq_plug(bio); if (!plug || rq_list_empty(plug->mq_list)) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index 63385742b8a8..f1b84e20b1a9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2808,7 +2808,7 @@ static void bio_set_ioprio(struct bio *bio) void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct blk_plug *plug = blk_mq_plug(q, bio); + struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; unsigned int nr_segs = 1; diff --git a/block/blk-mq.h b/block/blk-mq.h index 31d75a83a562..e694ec67d646 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -294,7 +294,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) /* * 
blk_mq_plug() - Get caller context plug - * @q: request queue * @bio : the bio being submitted by the caller context * * Plugging, by design, may delay the insertion of BIOs into the elevator in @@ -305,23 +304,22 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) * order. While this is not a problem with regular block devices, this ordering * change can cause write BIO failures with zoned block devices as these * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if the target request queue - * is for a zoned block device and the BIO to plug is a write operation. + * ignoring the plug state of a BIO issuing context if it is for a zoned block + * device and the BIO to plug is a write operation. * * Return current->plug if the bio can be plugged and NULL otherwise */ -static inline struct blk_plug *blk_mq_plug(struct request_queue *q, - struct bio *bio) +static inline struct blk_plug *blk_mq_plug( struct bio *bio) { + /* Zoned block device write operation case: do not plug the BIO */ + if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) + return NULL; + /* * For regular block devices or read operations, use the context plug * which may be NULL if blk_start_plug() was not executed. */ - if (!bdev_is_zoned(bio->bi_bdev) || !op_is_write(bio_op(bio))) - return current->plug; - - /* Zoned block device write operation case: do not plug the BIO */ - return NULL; + return current->plug; } /* Free all requests on the list */ -- cgit v1.2.3 From 052e545c9276f97e86368579fda32aa1ac017d51 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:39 +0200 Subject: block: simplify blk_check_zone_append Use the bdev based helpers instead of open coding them. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index bc16e9bae2dc..b530ce7b370c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -565,7 +565,6 @@ static int blk_partition_remap(struct bio *bio) static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { - sector_t pos = bio->bi_iter.bi_sector; int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ @@ -573,8 +572,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (pos & (blk_queue_zone_sectors(q) - 1) || - !blk_queue_zone_is_seq(q, pos)) + if (bio->bi_iter.bi_sector & (bdev_zone_sectors(bio->bi_bdev) - 1) || + !bio_zone_is_seq(bio)) return BLK_STS_IOERR; /* -- cgit v1.2.3 From 6b2bd274744e6454ba7bbbe6a09b44866f2f414a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:40 +0200 Subject: block: pass a gendisk to blk_queue_set_zoned Prepare for storing the zone related field in struct gendisk instead of struct request_queue. 
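With the gendisk-based name, a zoned driver's setup path would look roughly like this sketch (hypothetical driver fields; assumes CONFIG_BLK_DEV_ZONED and a host-managed device):

disk_set_zoned(edev->disk, BLK_ZONED_HM);
/* zone size is conventionally exposed via the chunk_sectors limit */
blk_queue_chunk_sectors(edev->disk->queue, edev->zone_sectors);
set_capacity(edev->disk, edev->capacity_sectors);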
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 9 +++++---- block/partitions/core.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ccceb421ed2..35b7bba306a8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -893,18 +893,19 @@ static bool disk_has_partitions(struct gendisk *disk) } /** - * blk_queue_set_zoned - configure a disk queue zoned model. + * disk_set_zoned - configure the zoned model for a disk * @disk: the gendisk of the queue to configure * @model: the zoned model to set * - * Set the zoned model of the request queue of @disk according to @model. + * Set the zoned model of @disk to @model. + * * When @model is BLK_ZONED_HM (host managed), this should be called only * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). * If @model specifies BLK_ZONED_HA (host aware), the effective model used * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions * on the disk. */ -void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { struct request_queue *q = disk->queue; @@ -948,7 +949,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) blk_queue_clear_zone_settings(q); } } -EXPORT_SYMBOL_GPL(blk_queue_set_zoned); +EXPORT_SYMBOL_GPL(disk_set_zoned); int bdev_alignment_offset(struct block_device *bdev) { diff --git a/block/partitions/core.c b/block/partitions/core.c index 7dc487f5b03c..1a45b1dd6491 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -330,7 +330,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); - blk_queue_set_zoned(disk, BLK_ZONED_NONE); + disk_set_zoned(disk, BLK_ZONED_NONE); break; case BLK_ZONED_NONE: break; -- cgit v1.2.3 From b3c72f8138b5f967a9fa527af84b35018897aba3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:41 +0200 Subject: block: pass a gendisk to blk_queue_clear_zone_settings Switch to a gendisk based API in preparation for moving all zone related fields from the request_queue to the gendisk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- block/blk-zoned.c | 4 +++- block/blk.h | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 35b7bba306a8..8bb9eef5310e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -946,7 +946,7 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); } else { - blk_queue_clear_zone_settings(q); + disk_clear_zone_settings(disk); } } EXPORT_SYMBOL_GPL(disk_set_zoned); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 90a5c9cc80ab..82a4fa89678c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -622,8 +622,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk, } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); -void blk_queue_clear_zone_settings(struct request_queue *q) +void disk_clear_zone_settings(struct gendisk *disk) { + struct request_queue *q = disk->queue; + blk_mq_freeze_queue(q); blk_queue_free_zone_bitmaps(q); diff --git a/block/blk.h b/block/blk.h index 58ad50cacd2d..7482a3a441dd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -406,10 +406,10 @@ static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); -void blk_queue_clear_zone_settings(struct request_queue *q); +void disk_clear_zone_settings(struct gendisk *disk); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} -static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} +static inline void disk_clear_zone_settings(struct gendisk *disk) {} #endif int blk_alloc_ext_minor(void); -- cgit v1.2.3 From 5d40066567a73a67ddb656ad118c6cfa1c4a6d71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:42 +0200 Subject: block: pass a gendisk to blk_queue_free_zone_bitmaps Switch to a gendisk based API in preparation for moving all zone related fields from the request_queue to the gendisk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 8 +++++--- block/blk.h | 4 ++-- block/genhd.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 82a4fa89678c..0d431394cf90 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -449,8 +449,10 @@ fail: return ret; } -void blk_queue_free_zone_bitmaps(struct request_queue *q) +void disk_free_zone_bitmaps(struct gendisk *disk) { + struct request_queue *q = disk->queue; + kfree(q->conv_zones_bitmap); q->conv_zones_bitmap = NULL; kfree(q->seq_zones_wlock); @@ -612,7 +614,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - blk_queue_free_zone_bitmaps(q); + disk_free_zone_bitmaps(disk); } blk_mq_unfreeze_queue(q); @@ -628,7 +630,7 @@ void disk_clear_zone_settings(struct gendisk *disk) blk_mq_freeze_queue(q); - blk_queue_free_zone_bitmaps(q); + disk_free_zone_bitmaps(disk); blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; q->nr_zones = 0; diff --git a/block/blk.h b/block/blk.h index 7482a3a441dd..b71e22c97d77 100644 --- a/block/blk.h +++ b/block/blk.h @@ -405,10 +405,10 @@ static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif #ifdef CONFIG_BLK_DEV_ZONED -void blk_queue_free_zone_bitmaps(struct request_queue *q); +void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); #else -static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} +static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} static inline void disk_clear_zone_settings(struct gendisk *disk) {} #endif diff --git a/block/genhd.c b/block/genhd.c index d0bdeb93e922..9d30f159c59a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1165,7 +1165,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); - blk_queue_free_zone_bitmaps(disk->queue); + disk_free_zone_bitmaps(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; -- cgit v1.2.3 From 1dc0172027b0aa09823b430e395b1116d2745f36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:43 +0200 Subject: block: remove queue_max_open_zones and queue_max_active_zones Always use the bdev based helpers instead. 
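As an illustrative sketch, a caller holding a struct block_device can
query the limits directly; the diagnostic helper below is hypothetical,
while bdev_max_open_zones() and bdev_max_active_zones() are the helpers
this patch standardizes on:

/* Hypothetical diagnostic helper, not part of this patch. */
static void sketch_print_zone_limits(struct block_device *bdev)
{
	pr_info("max open zones %u, max active zones %u\n",
		bdev_max_open_zones(bdev), bdev_max_active_zones(bdev));
}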
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7590810cf13f..5ce72345ac66 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -330,12 +330,12 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_open_zones(q), page); + return queue_var_show(bdev_max_open_zones(q->disk->part0), page); } static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_active_zones(q), page); + return queue_var_show(bdev_max_active_zones(q->disk->part0), page); } static ssize_t queue_nomerges_show(struct request_queue *q, char *page) -- cgit v1.2.3 From b623e347323f6464b20fb0d899a0a73522ed8f6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:45 +0200 Subject: block: replace blkdev_nr_zones with bdev_nr_zones Pass a block_device instead of a request_queue as that is what most callers have at hand. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Acked-by: Damien Le Moal Link: https://lore.kernel.org/r/20220706070350.1703384-12-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 15 ++++++++------- block/ioctl.c | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 0d431394cf90..2dec25d8aa3b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -108,21 +108,22 @@ void __blk_req_zone_write_unlock(struct request *rq) EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); /** - * blkdev_nr_zones - Get number of zones - * @disk: Target gendisk + * bdev_nr_zones - Get number of zones + * @bdev: Target device * * Return the total number of zones of a zoned block device. For a block * device without zone capabilities, the number of zones is always 0. 
*/ -unsigned int blkdev_nr_zones(struct gendisk *disk) +unsigned int bdev_nr_zones(struct block_device *bdev) { - sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); + sector_t zone_sectors = bdev_zone_sectors(bdev); - if (!blk_queue_is_zoned(disk->queue)) + if (!bdev_is_zoned(bdev)) return 0; - return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); + return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> + ilog2(zone_sectors); } -EXPORT_SYMBOL_GPL(blkdev_nr_zones); +EXPORT_SYMBOL_GPL(bdev_nr_zones); /** * blkdev_report_zones - Get zones information diff --git a/block/ioctl.c b/block/ioctl.c index 46949f1b0dba..60121e89052b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -495,7 +495,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: - return put_uint(argp, blkdev_nr_zones(bdev->bd_disk)); + return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ -- cgit v1.2.3 From 375c140c199ebd2866f9c50a0b8853ffca3f1b68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:46 +0200 Subject: block: use bdev based helpers in blkdev_zone_mgmt{,all} Use the bdev based helpers instead of the queue based ones to clean up the code a bit and prepare for storing all zone related fields in struct gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 2dec25d8aa3b..c2d8a38f449a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -190,8 +190,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t capacity = get_capacity(bdev->bd_disk); - sector_t zone_sectors = blk_queue_zone_sectors(q); + sector_t capacity = bdev_nr_sectors(bdev); + sector_t zone_sectors = bdev_zone_sectors(bdev); unsigned long *need_reset; struct bio *bio = NULL; sector_t sector = 0; @@ -262,8 +262,8 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors = blk_queue_zone_sectors(q); - sector_t capacity = get_capacity(bdev->bd_disk); + sector_t zone_sectors = bdev_zone_sectors(bdev); + sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; int ret = 0; -- cgit v1.2.3 From d86e716aa40643e3eb8c69fab3a198146bf76dd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jul 2022 09:03:50 +0200 Subject: block: move zone related fields to struct gendisk Move the zone related fields that are currently stored in struct request_queue to struct gendisk as these are part of the highlevel block layer API and are only used for non-passthrough I/O that requires the gendisk. 
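A sketch of what access to the moved state looks like afterwards; the
helper is hypothetical, but disk->seq_zones_wlock and the test_bit()
usage mirror the diff below:

/* Hypothetical helper, not part of this patch. */
static bool sketch_zone_write_locked(struct gendisk *disk, unsigned int zno)
{
	/* The write-lock bitmap now lives in the gendisk, not the queue. */
	return disk->seq_zones_wlock && test_bit(zno, disk->seq_zones_wlock);
}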
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220706070350.1703384-17-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs-zoned.c | 6 +++--- block/blk-sysfs.c | 2 +- block/blk-zoned.c | 45 +++++++++++++++++++++----------------------- 3 files changed, 25 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c index 038cb627c868..a77b099c34b7 100644 --- a/block/blk-mq-debugfs-zoned.c +++ b/block/blk-mq-debugfs-zoned.c @@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m) struct request_queue *q = data; unsigned int i; - if (!q->seq_zones_wlock) + if (!q->disk->seq_zones_wlock) return 0; - for (i = 0; i < q->nr_zones; i++) - if (test_bit(i, q->seq_zones_wlock)) + for (i = 0; i < q->disk->nr_zones; i++) + if (test_bit(i, q->disk->seq_zones_wlock)) seq_printf(m, "%u\n", i); return 0; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5ce72345ac66..c0303026752d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -325,7 +325,7 @@ static ssize_t queue_zoned_show(struct request_queue *q, char *page) static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nr_zones(q), page); + return queue_var_show(disk_nr_zones(q->disk), page); } static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index c2d8a38f449a..7c017458d5ce 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str); */ bool blk_req_needs_zone_write_lock(struct request *rq) { - if (!rq->q->seq_zones_wlock) + if (blk_rq_is_passthrough(rq)) return false; - if (blk_rq_is_passthrough(rq)) + if (!rq->q->disk->seq_zones_wlock) return false; switch (req_op(rq)) { @@ -77,7 +77,7 @@ bool blk_req_zone_write_trylock(struct request *rq) { unsigned int zno = blk_rq_zone_no(rq); - if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) + if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) return false; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); @@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); void __blk_req_zone_write_lock(struct request *rq) { if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->seq_zones_wlock))) + rq->q->disk->seq_zones_wlock))) return; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); @@ -101,9 +101,9 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); void __blk_req_zone_write_unlock(struct request *rq) { rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->seq_zones_wlock) + if (rq->q->disk->seq_zones_wlock) WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->seq_zones_wlock)); + rq->q->disk->seq_zones_wlock)); } EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); @@ -189,7 +189,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, static int blkdev_zone_reset_all_emulated(struct block_device *bdev, gfp_t gfp_mask) { - struct request_queue *q = bdev_get_queue(bdev); + struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); unsigned long *need_reset; @@ -197,19 +197,18 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, sector_t sector = 0; int ret; - need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones); + need_reset = blk_alloc_zone_bitmap(disk->queue->node, 
disk->nr_zones); if (!need_reset) return -ENOMEM; - ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0, - q->nr_zones, blk_zone_need_reset_cb, - need_reset); + ret = disk->fops->report_zones(disk, 0, disk->nr_zones, + blk_zone_need_reset_cb, need_reset); if (ret < 0) goto out_free_need_reset; ret = 0; while (sector < capacity) { - if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) { + if (!test_bit(disk_zone_no(disk, sector), need_reset)) { sector += zone_sectors; continue; } @@ -452,12 +451,10 @@ fail: void disk_free_zone_bitmaps(struct gendisk *disk) { - struct request_queue *q = disk->queue; - - kfree(q->conv_zones_bitmap); - q->conv_zones_bitmap = NULL; - kfree(q->seq_zones_wlock); - q->seq_zones_wlock = NULL; + kfree(disk->conv_zones_bitmap); + disk->conv_zones_bitmap = NULL; + kfree(disk->seq_zones_wlock); + disk->seq_zones_wlock = NULL; } struct blk_revalidate_zone_args { @@ -607,9 +604,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk, blk_mq_freeze_queue(q); if (ret > 0) { blk_queue_chunk_sectors(q, args.zone_sectors); - q->nr_zones = args.nr_zones; - swap(q->seq_zones_wlock, args.seq_zones_wlock); - swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + disk->nr_zones = args.nr_zones; + swap(disk->seq_zones_wlock, args.seq_zones_wlock); + swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); if (update_driver_data) update_driver_data(disk); ret = 0; @@ -634,9 +631,9 @@ void disk_clear_zone_settings(struct gendisk *disk) disk_free_zone_bitmaps(disk); blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; - q->nr_zones = 0; - q->max_open_zones = 0; - q->max_active_zones = 0; + disk->nr_zones = 0; + disk->max_open_zones = 0; + disk->max_active_zones = 0; q->limits.chunk_sectors = 0; q->limits.zone_write_granularity = 0; q->limits.max_zone_append_sectors = 0; -- cgit v1.2.3 From f3ec5d11554778c24ac8915e847223ed71d104fc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jul 2022 17:08:08 +0800 Subject: blk-mq: don't create hctx debugfs dir until q->debugfs_dir is created blk_mq_debugfs_register_hctx() can be called by blk_mq_update_nr_hw_queues when gendisk isn't added yet, such as nvme tcp. Fixes the warning of 'debugfs: Directory 'hctx0' with parent '/' already present!' which can be observed reliably when running blktests nvme/005. Fixes: 6cfc0081b046 ("blk-mq: no need to check return value of debugfs_create functions") Reported-by: Yi Zhang Signed-off-by: Ming Lei Tested-by: Yi Zhang Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220711090808.259682-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b11add9a95e2..7ee1b13380d0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -728,6 +728,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, char name[20]; int i; + if (!q->debugfs_dir) + return; + snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); -- cgit v1.2.3 From f4b1e27db49c8b985b116aa99481b4c6a4342ed4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 17:05:47 +0200 Subject: block/rq_qos: Use atomic_try_cmpxchg in atomic_inc_below Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in atomic_inc_below. 
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Link: https://lore.kernel.org/r/20220712150547.5786-1-ubizjak@gmail.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d3a75693adbf..88f0fe7dcf54 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -10,16 +10,10 @@ static bool atomic_inc_below(atomic_t *v, unsigned int below) { unsigned int cur = atomic_read(v); - for (;;) { - unsigned int old; - + do { if (cur >= below) return false; - old = atomic_cmpxchg(v, cur, cur + 1); - if (old == cur) - break; - cur = old; - } + } while (!atomic_try_cmpxchg(v, &cur, cur + 1)); return true; } -- cgit v1.2.3 From 939f9dd040fe1063d884f8f0f89b037093fe2341 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 17:27:41 +0200 Subject: block: Use try_cmpxchg in update_io_ticks Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in update_io_ticks. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Link: https://lore.kernel.org/r/20220712152741.7324-1-ubizjak@gmail.com Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b530ce7b370c..8365996a8ef8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -943,7 +943,7 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp))) { - if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) + if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->bd_partno) { -- cgit v1.2.3 From aee8960c2eae12636040dbf0f04e135273b1612d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 17:19:47 +0200 Subject: blk-iolatency: Use atomic{,64}_try_cmpxchg Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in check_scale_change and atomic64_try_cmpxchg in blkcg_iolatency_done_bio. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. 
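The shape of the conversion is easiest to see in a runnable userspace
analogue using C11 <stdatomic.h> (illustrative only; it mirrors
atomic_inc_below() from the earlier rq_qos patch in this series, not
the kernel API itself):

#include <stdatomic.h>
#include <stdbool.h>

static bool inc_below(atomic_uint *v, unsigned int below)
{
	unsigned int cur = atomic_load(v);

	do {
		if (cur >= below)
			return false;
		/* On failure, cur is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(v, &cur, cur + 1));

	return true;
}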
Signed-off-by: Uros Bizjak Cc: Jens Axboe Link: https://lore.kernel.org/r/20220712151947.6783-1-ubizjak@gmail.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 9568bf8dfe82..79745c6d8e15 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -401,7 +401,6 @@ static void check_scale_change(struct iolatency_grp *iolat) unsigned int cur_cookie; unsigned int our_cookie = atomic_read(&iolat->scale_cookie); u64 scale_lat; - unsigned int old; int direction = 0; if (lat_to_blkg(iolat)->parent == NULL) @@ -422,11 +421,10 @@ static void check_scale_change(struct iolatency_grp *iolat) else return; - old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); - - /* Somebody beat us to the punch, just bail. */ - if (old != our_cookie) + if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) { + /* Somebody beat us to the punch, just bail. */ return; + } if (direction < 0 && iolat->min_lat_nsec) { u64 samples_thresh; @@ -633,8 +631,8 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) window_start = atomic64_read(&iolat->window_start); if (now > window_start && (now - window_start) >= iolat->cur_win_nsec) { - if (atomic64_cmpxchg(&iolat->window_start, - window_start, now) == window_start) + if (atomic64_try_cmpxchg(&iolat->window_start, + &window_start, now)) iolatency_check_latencies(iolat, now); } } -- cgit v1.2.3 From 96388f57d2aad9836b2c589181fa1dbaba4066b4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 12 Jul 2022 17:44:55 +0200 Subject: blk-cgroup: Use atomic{,64}_try_cmpxchg Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in blkcg_unuse_delay, blkcg_set_delay and blkcg_clear_delay and atomic64_try_cmpxchg in blkcg_scale_delay. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Link: https://lore.kernel.org/r/20220712154455.66868-1-ubizjak@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 27a2d0ca0c70..869af9d72bcf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1700,7 +1700,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && - atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d4de0a35e066..d2724d1dd7c9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -430,12 +430,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. 
*/ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); - if (cur == old) - break; - old = cur; - } + while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) + ; if (old == 0) return 0; @@ -458,7 +454,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ - if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); atomic64_set(&blkg->delay_nsec, delay); @@ -475,7 +471,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ - if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } -- cgit v1.2.3 From 5bf83e9a14ddae994d783dee96b91bf46f04839c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Jul 2022 07:53:09 +0200 Subject: block: stop using bdevname in bdev_write_inode Just use the %pg format specifier instead. Also reformat the printk statement to be more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220713055317.1888500-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 5fe06c1f2def..ce05175e71ce 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -54,12 +54,10 @@ static void bdev_write_inode(struct block_device *bdev) while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); - if (ret) { - char name[BDEVNAME_SIZE]; - pr_warn_ratelimited("VFS: Dirty inode writeback failed " - "for block device %s (err=%d).\n", - bdevname(bdev, name), ret); - } + if (ret) + pr_warn_ratelimited( + "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", + bdev, ret); spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 02ff3dd20f512cf811ae8028c44fdb212b5f2bf7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Jul 2022 07:53:10 +0200 Subject: block: stop using bdevname in __blkdev_issue_discard Just use the %pg format specifier instead. 
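The general before/after shape of these conversions, as a sketch (the
message text is hypothetical):

	char name[BDEVNAME_SIZE];

	pr_warn("I/O error on %s\n", bdevname(bdev, name));	/* before */

	pr_warn("I/O error on %pg\n", bdev);			/* after */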
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220713055317.1888500-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 09b7e1200c0f..67e6dbc1ae81 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -48,10 +48,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard granularity isn't set by buggy device driver */ if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - char dev_name[BDEVNAME_SIZE]; - - bdevname(bdev, dev_name); - pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name); + pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", + bdev); return -EOPNOTSUPP; } -- cgit v1.2.3 From 900d156bac2bc474cf7c7bee4efbc6c83ec5ae58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Jul 2022 07:53:17 +0200 Subject: block: remove bdevname Replace the remaining calls of bdevname with snprintf using the %pg format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220713055317.1888500-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9d30f159c59a..44dfcf67ed96 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -101,29 +101,6 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) } EXPORT_SYMBOL_GPL(set_capacity_and_notify); -/* - * Format the device name of the indicated block device into the supplied buffer - * and return a pointer to that same buffer for convenience. - * - * Note: do not use this in new code, use the %pg specifier to sprintf and - * printk insted. - */ -const char *bdevname(struct block_device *bdev, char *buf) -{ - struct gendisk *hd = bdev->bd_disk; - int partno = bdev->bd_partno; - - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} -EXPORT_SYMBOL(bdevname); - static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { -- cgit v1.2.3 From ff07a02e9e8e6489db841e0c48a5c78e7e78d572 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:27 -0700 Subject: treewide: Rename enum req_opf into enum req_op The type name enum req_opf is misleading since it suggests that values of this type include both an operation type and flags. Since values of this type represent an operation only, change the type name into enum req_op. Convert the enum req_op documentation into kernel-doc format. Move a few definitions such that the enum req_op documentation occurs just above the enum req_op definition. The name "req_opf" was introduced by commit ef295ecf090d ("block: better op and flags encoding"). 
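To illustrate the distinction the rename captures (sketch; the helper
name is hypothetical): the operation says what to do, the flag bits say
how.

/* Hypothetical helper, not part of this patch. */
static bool sketch_is_sync_write(struct bio *bio)
{
	enum req_op op = bio_op(bio);	/* operation bits only */

	/* Modifier flags such as REQ_SYNC live in bi_opf, not in op. */
	return op == REQ_OP_WRITE && op_is_sync(bio->bi_opf);
}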
Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7c017458d5ce..a264621d4905 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -256,9 +256,8 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * The operation to execute on each zone can be a zone reset, open, close * or finish request. */ -int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, + sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); @@ -397,7 +396,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, void __user *argp = (void __user *)arg; struct request_queue *q; struct blk_zone_range zrange; - enum req_opf op; + enum req_op op; int ret; if (!argp) -- cgit v1.2.3 From 77e7ffd7ad3952909be6a9c599b7d164c8866fec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:28 -0700 Subject: block: Use enum req_op where appropriate Change the type of the arguments that are used to pass a REQ_OP_* value from int or unsigned int into enum req_op to improve static type checking. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- block/blk-mq-debugfs.c | 2 +- block/blk-throttle.c | 7 ++++--- block/blk-wbt.c | 2 +- block/blk.h | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 8365996a8ef8..67b8bcfa27f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,7 +136,7 @@ static const char *const blk_op_name[] = { * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". 
*/ -inline const char *blk_op_str(unsigned int op) +inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; @@ -953,7 +953,7 @@ again: } unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, unsigned int op, + unsigned int sectors, enum req_op op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -994,7 +994,7 @@ unsigned long bio_start_io_acct(struct bio *bio) } EXPORT_SYMBOL_GPL(bio_start_io_acct); -void bdev_end_io_acct(struct block_device *bdev, unsigned int op, +void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { const int sgrp = op_stat_group(op); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7ee1b13380d0..6cc2411e2d26 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -304,7 +304,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; - const unsigned int op = req_op(rq); + const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); seq_printf(m, "%p {.op=", rq); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 139b2d7a99e2..9f5fe62afff9 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2203,8 +2203,9 @@ out_unlock: #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_track_latency(struct throtl_data *td, sector_t size, - int op, unsigned long time) + enum req_op op, unsigned long time) { + const bool rw = op_is_write(op); struct latency_bucket *latency; int index; @@ -2215,10 +2216,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets[op]); + latency = get_cpu_ptr(td->latency_buckets[rw]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets[op]); + put_cpu_ptr(td->latency_buckets[rw]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0c119be0e813..7bf09ae06577 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -670,7 +670,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q) static int wbt_data_dir(const struct request *rq) { - const int op = req_op(rq); + const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; diff --git a/block/blk.h b/block/blk.h index b71e22c97d77..c4b084bfe87c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -160,7 +160,7 @@ static inline bool blk_discard_mergable(struct request *req) } static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, - int op) + enum req_op op) { if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, -- cgit v1.2.3 From 2d9b02be73ba8efba406b399a722b4e33614dd0e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:30 -0700 Subject: block: Change the type of req_op() and bio_op() into enum req_op Improve static type checking by changing the type of the value returned by req_op() and bio_op() from unsigned int into enum req_op. Insert 'default: break;' in switch statements on the enum req_op type to prevent that the compiler warns about these switch statements. 
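The 'default: break;' pattern referred to above looks like this in a
sketch (hypothetical helper, not part of this patch):

static bool sketch_is_erase(struct request *rq)
{
	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
		return true;
	default:
		break;	/* keeps the compiler quiet about unhandled ops */
	}
	return false;
}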
Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Cc: Tim Waugh Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-5-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 5abf5aa5a5f0..de178a8b4c82 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -405,6 +405,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return 1; case REQ_OP_WRITE_ZEROES: return 0; + default: + break; } rq_for_each_bvec(bv, rq, iter) -- cgit v1.2.3 From 16458cf3bd15e5624205df6e8a76b9a5363555f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:32 -0700 Subject: block: Use the new blk_opf_t type Use the new blk_opf_t type for arguments and variables that represent request flags or a bitwise combination of a request operation and request flags. Rename the function arguments and also a structure member that hold a request operation and flags from 'rw' into 'opf'. This patch does not change any functionality. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-7-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/bio.c | 10 +++++----- block/blk-cgroup-rwstat.h | 8 ++++---- block/blk-core.c | 2 +- block/blk-flush.c | 6 +++--- block/blk-merge.c | 6 +++--- block/blk-mq-debugfs.c | 4 ++-- block/blk-mq.c | 15 ++++++++------- block/blk-mq.h | 6 +++--- block/blk-wbt.c | 16 ++++++++-------- block/elevator.h | 2 +- block/fops.c | 12 ++++++------ 11 files changed, 44 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 888ee81ea303..6f9f883f9a65 100644 --- a/block/bio.c +++ b/block/bio.c @@ -239,7 +239,7 @@ static void bio_free(struct bio *bio) * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, - unsigned short max_vecs, unsigned int opf) + unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; @@ -292,7 +292,7 @@ EXPORT_SYMBOL(bio_init); * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ -void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) +void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); @@ -341,7 +341,7 @@ void bio_chain(struct bio *bio, struct bio *parent) EXPORT_SYMBOL(bio_chain); struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, - unsigned int nr_pages, unsigned int opf, gfp_t gfp) + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); @@ -409,7 +409,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, gfp_t gfp, + unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; @@ -468,7 +468,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * Returns: Pointer to new bio on success, NULL on failure. 
*/ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - unsigned int opf, gfp_t gfp_mask, + blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index 9f2723b34b75..022527b0b043 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) + blk_opf_t opf, uint64_t val) { struct percpu_counter *cnt; - if (op_is_discard(op)) + if (op_is_discard(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) + else if (op_is_write(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_is_sync(op)) + if (op_is_sync(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; diff --git a/block/blk-core.c b/block/blk-core.c index 67b8bcfa27f0..123468b9d2e4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { - BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * diff --git a/block/blk-flush.c b/block/blk-flush.c index c68968724870..d20a0c6b2c66 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -94,7 +94,7 @@ enum { }; static void blk_kick_flush(struct request_queue *q, - struct blk_flush_queue *fq, unsigned int flags); + struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) @@ -173,7 +173,7 @@ static void blk_flush_complete_seq(struct request *rq, { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; - unsigned int cmd_flags; + blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -290,7 +290,7 @@ bool is_flush_rq(struct request *rq) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, - unsigned int flags) + blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = diff --git a/block/blk-merge.c b/block/blk-merge.c index de178a8b4c82..3c3f785f558a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -712,7 +712,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, */ void blk_rq_set_mixed_merge(struct request *rq) { - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; if (rq->rq_flags & RQF_MIXED_MERGE) @@ -928,7 +928,7 @@ enum bio_merge_status { static enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -952,7 +952,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, static enum bio_merge_status bio_attempt_front_merge(struct request *req, struct bio 
*bio, unsigned int nr_segs) { - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6cc2411e2d26..8559cea7f300 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -313,8 +313,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) else seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); - blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, - ARRAY_SIZE(cmd_flag_name)); + blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK), + cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); diff --git a/block/blk-mq.c b/block/blk-mq.c index f1b84e20b1a9..d716b7f3763f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -510,13 +510,13 @@ retry: alloc_time_ns); } -struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, +struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, - .cmd_flags = op, + .cmd_flags = opf, .nr_tags = 1, }; struct request *rq; @@ -540,12 +540,12 @@ out_queue_exit: EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) + blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, - .cmd_flags = op, + .cmd_flags = opf, .nr_tags = 1, }; u64 alloc_time_ns = 0; @@ -660,7 +660,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", - (unsigned long long) rq->cmd_flags); + (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -713,8 +713,9 @@ static void blk_print_req_error(struct request *req, blk_status_t status) "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? 
req->q->disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, + blk_rq_pos(req), (__force u32)req_op(req), + blk_op_str(req_op(req)), + (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req->ioprio)); } diff --git a/block/blk-mq.h b/block/blk-mq.h index e694ec67d646..8ca453ac243d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -86,7 +86,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } -static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf) +static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; @@ -107,7 +107,7 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf) * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - unsigned int opf, + blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; @@ -152,7 +152,7 @@ struct blk_mq_alloc_data { struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; - unsigned int cmd_flags; + blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 7bf09ae06577..f2e4bf1dca47 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -451,7 +451,7 @@ static bool close_io(struct rq_wb *rwb) #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) -static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; @@ -462,7 +462,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) if (!rwb_enabled(rwb)) return UINT_MAX; - if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD) + if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* @@ -473,9 +473,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) limit = rwb->rq_depth.max_depth; - else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. @@ -490,13 +490,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; - unsigned long rw; + blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; - return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) @@ -510,13 +510,13 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) * the timer to kick off queuing again. 
*/ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw) + blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, - .rw = rw, + .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); diff --git a/block/elevator.h b/block/elevator.h index 16cd8bdedb7e..3f0593b3bf9d 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -34,7 +34,7 @@ struct elevator_mq_ops { int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); - void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); + void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); diff --git a/block/fops.c b/block/fops.c index 86d3cab9bf93..29066ac5a2fa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -32,14 +32,14 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static unsigned int dio_bio_write_op(struct kiocb *iocb) +static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { - unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb->ki_flags & IOCB_DSYNC) - op |= REQ_FUA; - return op; + opf |= REQ_FUA; + return opf; } static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, @@ -175,7 +175,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; - unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); + blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; @@ -297,7 +297,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, { struct block_device *bdev = iocb->ki_filp->private_data; bool is_read = iov_iter_rw(iter) == READ; - unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); + blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; -- cgit v1.2.3 From dc469ba2e790cb0a335e2650b701639752ff65cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:33 -0700 Subject: block/bfq: Use the new blk_opf_t type Use the new blk_opf_t type for arguments and variables that represent request flags or a bitwise combination of a request operation and request flags. Rename those variables from 'op' into 'opf'. This patch does not change any functionality. 
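A sketch of the convention after the rename: an 'opf' value is one
REQ_OP_* OR-ed with zero or more REQ_* flags, and the bitwise blk_opf_t
type lets sparse catch accidental mixing with plain integers
(hypothetical helper below):

static blk_opf_t sketch_write_opf(bool sync)
{
	blk_opf_t opf = REQ_OP_WRITE;

	if (sync)
		opf |= REQ_SYNC;
	return opf;
}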
Cc: Jan Kara Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-8-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 26 +++++++++++++------------- block/bfq-iosched.c | 16 ++++++++-------- block/bfq-iosched.h | 8 ++++---- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fc605791b1e..30b15a9a47c4 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -220,46 +220,46 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) + blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.queued, op, 1); + blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.queued, op, -1); + blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.merged, op, 1); + blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op) + u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = ktime_get_ns(); if (now > io_start_time_ns) - blkg_rwstat_add(&stats->service_time, op, + blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) - blkg_rwstat_add(&stats->wait_time, op, + blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) { } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } + blk_opf_t opf) { } +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op) { } + u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e6d7e6b01a05..c740b41fe0a4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -668,19 +668,19 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) * significantly affect service guarantees coming from the BFQ scheduling * algorithm. */ -static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; + struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; int depth; unsigned limit = data->q->nr_requests; /* Sync reads have full depth available */ - if (op_is_sync(op) && !op_is_write(op)) { + if (op_is_sync(opf) && !op_is_write(opf)) { depth = 0; } else { - depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; limit = (limit * depth) >> bfqd->full_depth_shift; } @@ -693,7 +693,7 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), depth); + __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); if (depth) data->shallow_depth = depth; } @@ -6104,7 +6104,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) static void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, - unsigned int cmd_flags) + blk_opf_t cmd_flags) { if (!bfqq) return; @@ -6129,7 +6129,7 @@ static void bfq_update_insert_stats(struct request_queue *q, static inline void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, - unsigned int cmd_flags) {} + blk_opf_t cmd_flags) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ static struct bfq_queue *bfq_init_rq(struct request *rq); @@ -6141,7 +6141,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; bool idle_timer_disabled = false; - unsigned int cmd_flags; + blk_opf_t cmd_flags; LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ca8177d7bf7c..ad8e513d7e87 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -994,11 +994,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op); -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); + blk_opf_t opf); +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op); + u64 io_start_time_ns, blk_opf_t opf); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); void bfqg_stats_update_idle_time(struct bfq_group *bfqg); -- cgit v1.2.3 From f8359efe4742a39b4ece554ab9d7e5f03c4fff83 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:34 -0700 Subject: block/mq-deadline: Use the new blk_opf_t type Use the new blk_opf_t type for an argument that represents a bitwise combination of a request operation and request flags. Rename that argument from 'op' into 'opf'. This patch does not change any functionality. 
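The classification dd_limit_depth() performs, restated as a sketch
(hypothetical helper, not part of this patch):

static bool sketch_is_sync_read(blk_opf_t opf)
{
	/* Synchronous and not a write: the case deadline leaves unthrottled. */
	return op_is_sync(opf) && !op_is_write(opf);
}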
From d625fecd8af84ac669075caf1941ff0d1995de56 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:35 -0700 Subject: block/kyber: Use the new blk_opf_t type Use the new blk_opf_t type for arguments that represent a bitwise combination of a request operation and request flags. Rename those arguments from 'op' to 'opf'. This patch does not change any functionality. Cc: Omar Sandoval Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-10-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 8f7c745b4a57..b05357bced99 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -195,9 +195,9 @@ struct kyber_hctx_data { static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -static unsigned int kyber_sched_domain(unsigned int op) +static unsigned int kyber_sched_domain(blk_opf_t opf) { - switch (op & REQ_OP_MASK) { + switch (opf & REQ_OP_MASK) { case REQ_OP_READ: return KYBER_READ; case REQ_OP_WRITE: @@ -553,13 +553,13 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, } } -static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { /* * We use the scheduler tags as per-hardware queue queueing tokens. * Async requests can be limited at this stage. */ - if (!op_is_sync(op)) { + if (!op_is_sync(opf)) { struct kyber_queue_data *kqd = data->q->elevator->elevator_data; data->shallow_depth = kqd->async_depth; -- cgit v1.2.3
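A fair question about this series is what a typedef buys, since C would silently accept a plain unsigned int anyway. The answer is static analysis: the kernel declares blk_opf_t with the __bitwise annotation, which sparse treats as a distinct type while ordinary compilers see a plain integer. A condensed sketch of that arrangement follows (the real definitions live in include/linux/compiler_types.h and include/linux/blk_types.h, where blk_opf_t is a __u32):

/* Under sparse (__CHECKER__ defined), __bitwise makes the typedef a
 * distinct type, so passing a bare int where a blk_opf_t is expected
 * is reported; regular compilers just see an unsigned integer. */
#ifdef __CHECKER__
#define __bitwise __attribute__((bitwise))
#else
#define __bitwise
#endif

typedef unsigned int __bitwise blk_opf_t;

That is also why each of these patches can claim "does not change any functionality": the conversion only tightens what sparse will accept, without altering generated code.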
From 14a6e2eb7df5c7897c15b109cba29ab0c4a791b6 Mon Sep 17 00:00:00 2001 From: Jinke Han Date: Wed, 20 Jul 2022 17:36:16 +0800 Subject: block: don't allow the same type rq_qos add more than once In our testing of iocost, we encountered list add/del corruptions of the inner_walk list in ioc_timer_fn. The reason can be described as follows:

         cpu 0                          cpu 1
ioc_qos_write                    ioc_qos_write
ioc = q_to_ioc(queue);
if (!ioc) {
        ioc = kzalloc();
                                 ioc = q_to_ioc(queue);
                                 if (!ioc) {
                                         ioc = kzalloc();
                                         ...
                                         rq_qos_add(q, rqos);
                                 }
        ...
        rq_qos_add(q, rqos);
        ...
}

When the io.cost.qos file is written by two CPUs concurrently, rq_qos may be added to one disk twice. In that case, there will be two iocs enabled and running on one disk. They own different iocgs on their active lists. In ioc_timer_fn, because the iocgs from the two iocs share the same root iocg, each ioc may overwrite the root iocg's walk_list, and this leads to list add/del corruptions when building or destroying the inner_walk list. So far, the blk-rq-qos framework has implicitly assumed at most one instance of each rq_qos type per queue. This patch makes that assumption explicit and also fixes the crash above. Signed-off-by: Jinke Han Reviewed-by: Muchun Song Acked-by: Tejun Heo Cc: Link: https://lore.kernel.org/r/20220720093616.70584-1-hanjinke.666@bytedance.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 20 +++++++++++++------- block/blk-iolatency.c | 18 +++++++++++------- block/blk-rq-qos.h | 11 ++++++++++- block/blk-wbt.c | 12 +++++++++++- 4 files changed, 45 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b7082f2aed9c..7936e5f5821c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2886,15 +2886,21 @@ static int blk_iocost_init(struct request_queue *q) * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ - rq_qos_add(q, rqos); + ret = rq_qos_add(q, rqos); + if (ret) + goto err_free_ioc; + ret = blkcg_activate_policy(q, &blkcg_policy_iocost); - if (ret) { - rq_qos_del(q, rqos); - free_percpu(ioc->pcpu_stat); - kfree(ioc); - return ret; - } + if (ret) + goto err_del_qos; return 0; + +err_del_qos: + rq_qos_del(q, rqos); +err_free_ioc: + free_percpu(ioc->pcpu_stat); + kfree(ioc); + return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 79745c6d8e15..e285152345a2 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -771,19 +771,23 @@ int blk_iolatency_init(struct request_queue *q) rqos->ops = &blkcg_iolatency_ops; rqos->q = q; - rq_qos_add(q, rqos); - + ret = rq_qos_add(q, rqos); + if (ret) + goto err_free; ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); - if (ret) { - rq_qos_del(q, rqos); - kfree(blkiolat); - return ret; - } + if (ret) + goto err_qos_del; timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); return 0; + +err_qos_del: + rq_qos_del(q, rqos); +err_free: + kfree(blkiolat); + return ret; } static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 0e46052b018a..08b856570ad1 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -86,7 +86,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) init_waitqueue_head(&rq_wait->wait); } -static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) +static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { /* * No IO can be in-flight when adding rqos, so freeze queue, which @@ -98,6 +98,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_freeze_queue(q); spin_lock_irq(&q->queue_lock); + if (rq_qos_id(q, rqos->id)) + goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; spin_unlock_irq(&q->queue_lock); @@ -109,6 +111,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_debugfs_register_rqos(rqos); mutex_unlock(&q->debugfs_mutex); } + + return 0; +ebusy: + spin_unlock_irq(&q->queue_lock); + blk_mq_unfreeze_queue(q); + return -EBUSY; + } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f2e4bf1dca47..a9982000b667 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -820,6 +820,7 @@ int wbt_init(struct request_queue *q) { struct rq_wb *rwb; int i; + int ret; rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) @@ -846,7 +847,10 @@ int wbt_init(struct request_queue *q) /* * Assign rwb and add the stats callback. */ - rq_qos_add(q, &rwb->rqos); + ret = rq_qos_add(q, &rwb->rqos); + if (ret) + goto err_free; + blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); @@ -855,4 +859,10 @@ int wbt_init(struct request_queue *q) wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; + +err_free: + blk_stat_free_callback(rwb->cb); + kfree(rwb); + return ret; + } -- cgit v1.2.3
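The essence of the fix is visible in the rq_qos_add() hunk above: the lookup for an existing instance and the list insertion happen inside one critical section, so a concurrent caller gets -EBUSY instead of a second registration. Here is a standalone userspace sketch of the same idiom, using a pthread mutex where the kernel uses queue freezing plus queue_lock; the struct and function names are invented for the sketch, not kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct rq_qos_like {
	int id;				/* at most one instance per id */
	struct rq_qos_like *next;
};

static struct rq_qos_like *head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Lookup and insertion share one critical section, so two racing
 * callers cannot both register the same id: the loser gets -EBUSY,
 * as rq_qos_add() now does. */
static int qos_add(struct rq_qos_like *new)
{
	struct rq_qos_like *p;
	int ret = 0;

	pthread_mutex_lock(&lock);
	for (p = head; p; p = p->next) {
		if (p->id == new->id) {
			ret = -EBUSY;
			goto out;
		}
	}
	new->next = head;
	head = new;
out:
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	struct rq_qos_like a = { .id = 1 }, b = { .id = 1 };

	printf("first add:  %d\n", qos_add(&a));	/* 0 */
	printf("second add: %d\n", qos_add(&b));	/* -EBUSY */
	return 0;
}

As in the patched kernel code, callers must check the return value and unwind on failure, which is exactly what the new err_* labels in blk_iocost_init(), blk_iolatency_init() and wbt_init() above provide.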
From 0a3e5cc7bbfcd571a2e53779ef7d7aa3c57d5432 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2022 15:05:40 +0200 Subject: blk-mq: fix error handling in __blk_mq_alloc_disk To fully clean up the queue if the disk allocation fails, we need to call blk_mq_destroy_queue and not just blk_put_queue. Fixes: 6f8191fdf41d ("block: simplify disk shutdown") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220720130541.1323531-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d716b7f3763f..70177ee74295 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3960,7 +3960,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { - blk_put_queue(q); + blk_mq_destroy_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); -- cgit v1.2.3
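This one-line fix encodes a general rule for paired setup/teardown APIs: once construction has gone past plain allocation, an error path must invoke the matching full destructor, not merely drop a reference, or everything the init step attached leaks. A small userspace analogue, with hypothetical names standing in for the block-layer objects:

#include <stdlib.h>

struct tagset { int *tags; };

struct queue {
	struct tagset *set;		/* attached during init */
	int refs;
};

/* The matching full destructor: undoes everything init attached. */
void queue_destroy(struct queue *q)
{
	free(q->set->tags);
	free(q->set);
	free(q);
}

/* Dropping the last reference frees only the queue shell; if init
 * already attached a tagset, unwinding this way leaks it -- the
 * analogue of the blk_put_queue() call the patch replaces. */
void queue_put(struct queue *q)
{
	if (--q->refs == 0)
		free(q);
}

int main(void)
{
	struct queue *q = calloc(1, sizeof(*q));

	if (!q)
		return 1;
	q->refs = 1;
	q->set = calloc(1, sizeof(*q->set));
	if (!q->set) {
		free(q);
		return 1;
	}
	q->set->tags = calloc(64, sizeof(int));

	/* A later step (the disk allocation) fails: unwind with the
	 * full destructor, not queue_put(q). */
	queue_destroy(q);
	return 0;
}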
From c5db2cfc6274692d821d33b59acb6ff615e350c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2022 15:05:41 +0200 Subject: block: call blk_mq_exit_queue from disk_release for never added disks To undo all the initialization from blk_mq_init_allocated_queue in case of a probe failure where add_disk is never called, we have to call blk_mq_exit_queue from put_disk. This relies on the fact that drivers always call blk_mq_free_tag_set after calling put_disk in the probe error path if they have a gendisk at all. We should be doing this in general, but can't do it for the normal teardown case (yet) as the tagset can be gone by the time the disk is released once it was added. I hope to sort this out properly eventually, but for now this isolated hack will do it. Fixes: 6f8191fdf41d ("block: simplify disk shutdown") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220720130541.1323531-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 44dfcf67ed96..e1d5b10ac193 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1138,6 +1138,18 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); + /* + * To undo all the initialization from blk_mq_init_allocated_queue in + * case of a probe failure where add_disk is never called, we have to + * call blk_mq_exit_queue here. We can't do this for the more common + * teardown case (yet) as the tagset can be gone by the time the disk + * is released once it was added. + */ + if (queue_is_mq(disk->queue) && + test_bit(GD_OWNS_QUEUE, &disk->state) && + !test_bit(GD_ADDED, &disk->state)) + blk_mq_exit_queue(disk->queue); + blkcg_exit_queue(disk->queue); disk_release_events(disk); @@ -1403,6 +1415,9 @@ EXPORT_SYMBOL(__blk_alloc_disk); * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. * + * Note: for a blk-mq disk, put_disk must be called before freeing the tag_set + * when handling probe errors (that is, before add_disk() is called). + * * Context: Any context, but the last reference must not be dropped from * atomic context. */ -- cgit v1.2.3 From 828b5f017d9d5d0491cd2e71d48f4f2139078e2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Jul 2022 08:34:32 +0200 Subject: block: remove __blk_get_queue __blk_get_queue is only called by blk_get_queue, so merge the two. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220721063432.1714609-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++++------ block/blk.h | 5 ----- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 123468b9d2e4..3d286a256d3d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -461,12 +461,10 @@ fail_q: */ bool blk_get_queue(struct request_queue *q) { - if (likely(!blk_queue_dying(q))) { - __blk_get_queue(q); - return true; - } - - return false; + if (unlikely(blk_queue_dying(q))) + return false; + kobject_get(&q->kobj); + return true; } EXPORT_SYMBOL(blk_get_queue); diff --git a/block/blk.h b/block/blk.h index c4b084bfe87c..1d83b1d41cd1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -31,11 +31,6 @@ extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -static inline void __blk_get_queue(struct request_queue *q) -{ - kobject_get(&q->kobj); -} - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, -- cgit v1.2.3
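The merged blk_get_queue() is an instance of the common conditional-get pattern: refuse to hand out new references once an object is marked dying, otherwise bump the refcount and report success. A userspace approximation using C11 atomics follows; the struct and helper names are invented for the sketch, while the kernel version works on the queue's kobject:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue_like {
	atomic_int  refs;
	atomic_bool dying;
};

/* Mirrors the shape of the merged blk_get_queue(): no new references
 * once the object is dying, otherwise take one. */
static bool queue_try_get(struct queue_like *q)
{
	if (atomic_load(&q->dying))
		return false;
	atomic_fetch_add(&q->refs, 1);
	return true;
}

int main(void)
{
	struct queue_like q = { .refs = 1, .dying = false };

	if (queue_try_get(&q))
		printf("got reference, refs=%d\n", atomic_load(&q.refs));

	atomic_store(&q.dying, true);
	if (!queue_try_get(&q))
		printf("refused: queue is dying\n");
	return 0;
}

Note that, as in the kernel function, the dying check and the increment are not a single atomic step; this is safe only when the caller already holds a reference or outer synchronization keeps the dying flag stable across the call.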