From 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 17 Jan 2024 11:19:45 +0800 Subject: md: Fix overflow in is_mddev_idle UBSAN reports this problem: UBSAN: Undefined behaviour in drivers/md/md.c:8175:15 signed integer overflow: -2147483291 - 2072033152 cannot be represented in type 'int' Call trace: dump_backtrace+0x0/0x310 show_stack+0x28/0x38 dump_stack+0xec/0x15c ubsan_epilogue+0x18/0x84 handle_overflow+0x14c/0x19c __ubsan_handle_sub_overflow+0x34/0x44 is_mddev_idle+0x338/0x3d8 md_do_sync+0x1bb8/0x1cf8 md_thread+0x220/0x288 kthread+0x1d8/0x1e0 ret_from_fork+0x10/0x18 'curr_events' will overflow when stat accum or 'sync_io' is greater than INT_MAX. Fix it by changing sync_io, last_events and curr_events to 64bit. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240117031946.2324519-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..d0fa4f1c29a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -172,7 +172,7 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ + atomic64_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From 293066264fb45df4290897c22e69833bea5fe171 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Apr 2024 16:37:26 +0200 Subject: scsi: block: Add a helper to cancel atomic queue limit updates Drivers might have to perform complex actions to determine queue limits, and those might fail. Add a helper to cancel a queue limit update that can be called in those cases. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240409143748.980206-2-hch@lst.de Reviewed-by: Kanchan Joshi Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..ded7f66dc4b9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -892,6 +892,19 @@ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); +/** + * queue_limits_cancel_update - cancel an atomic update of queue limits + * @q: queue to update + * + * This functions cancels an atomic update of the queue limits started by + * queue_limits_start_update() and should be used when an error occurs after + * starting update. + */ +static inline void queue_limits_cancel_update(struct request_queue *q) +{ + mutex_unlock(&q->limits_lock); +} + /* * Access functions for manipulating queue properties */ -- cgit v1.2.3 From ec84ca4025c0b90c6b7173a230f78ec00cad44f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Apr 2024 16:37:48 +0200 Subject: scsi: block: Remove now unused queue limits helpers Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240409143748.980206-24-hch@lst.de Reviewed-by: Bart Van Assche Reviewed-by: John Garry Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ded7f66dc4b9..e3c7082efa39 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -908,15 +908,9 @@ static inline void queue_limits_cancel_update(struct request_queue *q) /* * Access functions for manipulating queue properties */ -void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit); -extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); -extern void blk_queue_max_segments(struct request_queue *, unsigned short); -extern void blk_queue_max_discard_segments(struct request_queue *, - unsigned short); void blk_queue_max_secure_erase_sectors(struct request_queue *q, unsigned int max_sectors); -extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, @@ -933,7 +927,6 @@ void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); -extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, @@ -941,10 +934,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); -extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -extern void blk_queue_dma_alignment(struct request_queue *, int); -extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); @@ -961,8 +950,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk, extern void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features); -extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, - struct device *dev); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); -- cgit v1.2.3 From b85a3c1b7978f942fa5bf8cfe22b6a6aaa49d3b7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:04 +0900 Subject: block: Introduce bio_straddles_zones() and bio_offset_from_zone_start() Implement the inline helper functions bio_straddles_zones() and bio_offset_from_zone_start() to respectively test if a BIO crosses a zone boundary (the start sector and last sector belong to different zones) and to obtain the offset of a BIO from the start sector of its target zone. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..ec7bd7091467 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -853,6 +853,13 @@ static inline unsigned int bio_zone_no(struct bio *bio) return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } +static inline bool bio_straddles_zones(struct bio *bio) +{ + return bio_sectors(bio) && + bio_zone_no(bio) != + disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); +} + static inline unsigned int bio_zone_is_seq(struct bio *bio) { return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); @@ -1328,6 +1335,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, return sector & (bdev_zone_sectors(bdev) - 1); } +static inline sector_t bio_offset_from_zone_start(struct bio *bio) +{ + return bdev_offset_from_zone_start(bio->bi_bdev, + bio->bi_iter.bi_sector); +} + static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { -- cgit v1.2.3 From ecfe43b11b02ffeb24c203af7d3947417d412cf7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:06 +0900 Subject: block: Remember zone capacity when revalidating zones In preparation for adding zone write plugging, modify blk_revalidate_disk_zones() to get the capacity of zones of a zoned block device. This capacity value as a number of 512B sectors is stored in the gendisk zone_capacity field. Given that host-managed SMR disks (including zoned UFS drives) and all known NVMe ZNS devices have the same zone capacity for all zones blk_revalidate_disk_zones() returns an error if different capacities are detected for different zones. This also adds check to verify that the values reported by the device for zone capacities are correct, that is, that the zone capacity is never 0, does not exceed the zone size and is equal to the zone size for conventional zones. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240408014128.205141-7-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec7bd7091467..4e81f714cca7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,6 +191,7 @@ struct gendisk { * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; + unsigned int zone_capacity; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From dd291d77cc90eb6a86e9860ba8e6e38eebd57d12 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:07 +0900 Subject: block: Introduce zone write plugging Zone write plugging implements a per-zone "plug" for write operations to control the submission and execution order of write operations to sequential write required zones of a zoned block device. Per-zone plugging guarantees that at any time there is at most only one write request per zone being executed. This mechanism is intended to replace zone write locking which implements a similar per-zone write throttling at the scheduler level, but is implemented only by mq-deadline. Unlike zone write locking which operates on requests, zone write plugging operates on BIOs. A zone write plug is simply a BIO list that is atomically manipulated using a spinlock and a kblockd submission work. A write BIO to a zone is "plugged" to delay its execution if a write BIO for the same zone was already issued, that is, if a write request for the same zone is being executed. The next plugged BIO is unplugged and issued once the write request completes. This mechanism allows to: - Untangle zone write ordering from block IO schedulers. This allows removing the restriction on using mq-deadline for writing to zoned block devices. Any block IO scheduler, including "none" can be used. - Zone write plugging operates on BIOs instead of requests. Plugged BIOs waiting for execution thus do not hold scheduling tags and thus are not preventing other BIOs from executing (reads or writes to other zones). Depending on the workload, this can significantly improve the device use (higher queue depth operation) and performance. - Both blk-mq (request based) zoned devices and BIO-based zoned devices (e.g. device mapper) can use zone write plugging. It is mandatory for the former but optional for the latter. BIO-based drivers can use zone write plugging to implement write ordering guarantees, or the drivers can implement their own if needed. - The code is less invasive in the block layer and is mostly limited to blk-zoned.c with some small changes in blk-mq.c, blk-merge.c and bio.c. Zone write plugging is implemented using struct blk_zone_wplug. This structure includes a spinlock, a BIO list and a work structure to handle the submission of plugged BIOs. Zone write plugs structures are managed using a per-disk hash table. Plugging of zone write BIOs is done using the function blk_zone_write_plug_bio() which returns false if a BIO execution does not need to be delayed and true otherwise. This function is called from blk_mq_submit_bio() after a BIO is split to avoid large BIOs spanning multiple zones which would cause mishandling of zone write plugs. This ichange enables by default zone write plugging for any mq request-based block device. BIO-based device drivers can also use zone write plugging by expliclty calling blk_zone_write_plug_bio() in their ->submit_bio method. For such devices, the driver must ensure that a BIO passed to blk_zone_write_plug_bio() is already split and not straddling zone boundaries. Only write and write zeroes BIOs are plugged. Zone write plugging does not introduce any significant overhead for other operations. A BIO that is being handled through zone write plugging is flagged using the new BIO flag BIO_ZONE_WRITE_PLUGGING. A request handling a BIO flagged with this new flag is flagged with the new RQF_ZONE_WRITE_PLUGGING flag. The completion of BIOs and requests flagged trigger respectively calls to the functions blk_zone_write_bio_endio() and blk_zone_write_complete_request(). The latter function is used to trigger submission of the next plugged BIO using the zone plug work. blk_zone_write_bio_endio() does the same for BIO-based devices. This ensures that at any time, at most one request (blk-mq devices) or one BIO (BIO-based devices) is being executed for any zone. The handling of zone write plugs using a per-zone plug spinlock maximizes parallelism and device usage by allowing multiple zones to be writen simultaneously without lock contention. Zone write plugging ignores flush BIOs without data. Hovever, any flush BIO that has data is always plugged so that the write part of the flush sequence is serialized with other regular writes. Given that any BIO handled through zone write plugging will be the only BIO in flight for the target zone when it is executed, the unplugging and submission of a BIO will have no chance of successfully merging with plugged requests or requests in the scheduler. To overcome this potential performance degradation, blk_mq_submit_bio() calls the function blk_zone_write_plug_attempt_merge() to try to merge other plugged BIOs with the one just unplugged and submitted. Successful merging is signaled using blk_zone_write_plug_bio_merged(), called from bio_attempt_back_merge(). Furthermore, to avoid recalculating the number of segments of plugged BIOs to attempt merging, the number of segments of a plugged BIO is saved using the new struct bio field __bi_nr_segments. To avoid growing the size of struct bio, this field is added as a union with the bio_cookie field. This is safe to do as polling is always disabled for plugged BIOs. When BIOs are plugged in a zone write plug, the device request queue usage counter is always incremented. This reference is kept and reused for blk-mq devices when the plugged BIO is unplugged and submitted again using submit_bio_noacct_nocheck(). For this case, the unplugged BIO is already flagged with BIO_ZONE_WRITE_PLUGGING and blk_mq_submit_bio() proceeds directly to allocating a new request for the BIO, re-using the usage reference count taken when the BIO was plugged. This extra reference count is dropped in blk_zone_write_plug_attempt_merge() for any plugged BIO that is successfully merged. Given that BIO-based devices will not take this path, the extra reference is dropped after a plugged BIO is unplugged and submitted. Zone write plugs are dynamically allocated and managed using a hash table (an array of struct hlist_head) with RCU protection. A zone write plug is allocated when a write BIO is received for the zone and not freed until the zone is fully written, reset or finished. To detect when a zone write plug can be freed, the write state of each zone is tracked using a write pointer offset which corresponds to the offset of a zone write pointer relative to the zone start. Write operations always increment this write pointer offset. Zone reset operations set it to 0 and zone finish operations set it to the zone size. If a write error happens, the wp_offset value of a zone write plug may become incorrect and out of sync with the device managed write pointer. This is handled using the zone write plug flag BLK_ZONE_WPLUG_ERROR. The function blk_zone_wplug_handle_error() is called from the new disk zone write plug work when this flag is set. This function executes a report zone to update the zone write pointer offset to the current value as indicated by the device. The disk zone write plug work is scheduled whenever a BIO flagged with BIO_ZONE_WRITE_PLUGGING completes with an error or when bio_zone_wplug_prepare_bio() detects an unaligned write. Once scheduled, the disk zone write plugs work keeps running until all zone errors are handled. To match the new data structures used for zoned disks, the function disk_free_zone_bitmaps() is renamed to the more generic disk_free_zone_resources(). The function disk_init_zone_resources() is also introduced to initialize zone write plugs resources when a gendisk is allocated. In order to guarantee that the user can simultaneously write up to a number of zones equal to a device max active zone limit or max open zone limit, zone write plugs are allocated using a mempool sized to the maximum of these 2 device limits. For a device that does not have active and open zone limits, 128 is used as the default mempool size. If a change to the device active and open zone limits is detected, the disk mempool is resized when blk_revalidate_disk_zones() is executed. This commit contains contributions from Christoph Hellwig . Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240408014128.205141-8-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e81f714cca7..348b57ca0425 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,6 +194,12 @@ struct gendisk { unsigned int zone_capacity; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int zone_wplugs_hash_bits; + spinlock_t zone_wplugs_lock; + struct mempool_s *zone_wplugs_pool; + struct hlist_head *zone_wplugs_hash; + struct list_head zone_wplugs_err_list; + struct work_struct zone_wplugs_work; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -663,6 +669,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { @@ -690,6 +697,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return 0; } +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_depth(struct request_queue *q) -- cgit v1.2.3 From ccdbf0aad2523ca133cceb22ce0f8306730e7ac3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:09 +0900 Subject: block: Allow zero value of max_zone_append_sectors queue limit In preparation for adding a generic zone append emulation using zone write plugging, allow device drivers supporting zoned block device to set a the max_zone_append_sectors queue limit of a device to 0 to indicate the lack of native support for zone append operations and that the block layer should emulate these operations using regular write operations. blk_queue_max_zone_append_sectors() is modified to allow passing 0 as the max_zone_append_sectors argument. The function queue_max_zone_append_sectors() is also modified to ensure that the minimum of the max_hw_sectors and chunk_sectors limit is used whenever the max_zone_append_sectors limit is 0. This minimum is consistent with the value set for the max_zone_append_sectors limit by the function blk_validate_zoned_limits() when limits for a queue are validated. The helper functions queue_emulates_zone_append() and bdev_emulates_zone_append() are added to test if a queue (or block device) emulates zone append operations. In order for blk_revalidate_disk_zones() to accept zoned block devices relying on zone append emulation, the direct check to the max_zone_append_sectors queue limit of the disk is replaced by a check using the value returned by queue_max_zone_append_sectors(). Similarly, queue_zone_append_max_show() is modified to use the same accessor so that the sysfs attribute advertizes the non-zero limit that will be used, regardless if it is for native or emulated commands. For stacking drivers, a top device should not need to care if the underlying devices have native or emulated zone append operations. blk_stack_limits() is thus modified to set the top device max_zone_append_sectors limit using the new accessor queue_limits_max_zone_append_sectors(). queue_max_zone_append_sectors() is modified to use this function as well. Stacking drivers that require zone append emulation, e.g. dm-crypt, can still request this feature by calling blk_queue_max_zone_append_sectors() with a 0 limit. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-10-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 348b57ca0425..46613bf6a402 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1173,12 +1173,29 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) { + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - const struct queue_limits *l = &q->limits; + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; - return min(l->max_zone_append_sectors, l->max_sectors); + return queue_limits_max_zone_append_sectors(&q->limits); +} + +static inline bool queue_emulates_zone_append(struct request_queue *q) +{ + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; +} + +static inline bool bdev_emulates_zone_append(struct block_device *bdev) +{ + return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int -- cgit v1.2.3 From 9b3c08b90fc212de58c34621d83e74977170b2cd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:20 +0900 Subject: block: Simplify blk_revalidate_disk_zones() interface The only user of blk_revalidate_disk_zones() second argument was the SCSI disk driver (sd). Now that this driver does not require this update_driver_data argument, remove it to simplify the interface of blk_revalidate_disk_zones(). Also update the function kdoc comment to be more accurate (i.e. there is no gendisk ->revalidate method). Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-21-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46613bf6a402..fbc6860b3622 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -336,8 +336,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); +int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes -- cgit v1.2.3 From e4eb37cc0f3ed8971c50dddfbeb35a799e5b504e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:22 +0900 Subject: block: Remove elevator required features The only elevator feature ever implemented is ELEVATOR_F_ZBD_SEQ_WRITE for signaling that a scheduler implements zone write locking to tightly control the dispatching order of write operations to zoned block devices. With the removal of zone write locking support in mq-deadline and the reliance of all block device drivers on the block layer zone write plugging to control ordering of write operations to zones, the elevator feature ELEVATOR_F_ZBD_SEQ_WRITE is completely unused. Remove it, and also remove the now unused code for filtering the possible schedulers for a block device based on required features. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-23-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fbc6860b3622..017e9d064177 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -453,8 +453,6 @@ struct request_queue { atomic_t nr_active_requests_shared_tags; - unsigned int required_elevator_features; - struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -958,14 +956,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); -/* - * Elevator features for blk_queue_required_elevator_features: - */ -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) - -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); -- cgit v1.2.3 From 02ccd7c360b1692da164842f211d41fab7d83adc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:26 +0900 Subject: block: Remove zone write locking Zone write locking is now unused and replaced with zone write plugging. Remove all code that was implementing zone write locking, that is, the various helper functions controlling request zone write locking and the gendisk attached zone bitmaps. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-27-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 017e9d064177..7e8a68805324 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -177,23 +177,14 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_ZONED /* - * Zoned block device information for request dispatch control. - * nr_zones is the total number of zones of the device. This is always - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones - * bits which indicates if a zone is conventional (bit set) or - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones - * bits which indicates if a zone is write locked, that is, if a write - * request targeting the zone was dispatched. - * - * Reads of this information must be protected with blk_queue_enter() / - * blk_queue_exit(). Modifying this information is only allowed while - * no requests are being processed. See also blk_mq_freeze_queue() and - * blk_mq_unfreeze_queue(). + * Zoned block device information. Reads of this information must be + * protected with blk_queue_enter() / blk_queue_exit(). Modifying this + * information is only allowed while no requests are being processed. + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; unsigned int zone_capacity; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; @@ -635,15 +626,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - if (!blk_queue_is_zoned(disk->queue)) - return false; - if (!disk->conv_zones_bitmap) - return true; - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); -} - static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { @@ -677,10 +659,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - return false; -} static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { return 0; @@ -869,11 +847,6 @@ static inline bool bio_straddles_zones(struct bio *bio) disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } -static inline unsigned int bio_zone_is_seq(struct bio *bio) -{ - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); -} - /* * Return how much of the chunk is left to be used for I/O at a given offset. */ -- cgit v1.2.3 From 99a9476b27e89525cef653b91e542baf61f105d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 8 Apr 2024 10:41:28 +0900 Subject: block: Do not special-case plugging of zone write operations With the block layer zone write plugging being automatically done for any write operation to a zone of a zoned block device, a regular request plugging handled through current->plug can only ever see at most a single write request per zone. In such case, any potential reordering of the plugged requests will be harmless. We can thus remove the special casing for write operations to zones and have these requests plugged as well. This allows removing the function blk_mq_plug and instead directly using current->plug where needed. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Tested-by: Hans Holmberg Tested-by: Dennis Maisenbacher Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240408014128.205141-29-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e8a68805324..1ce8ba08e318 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1299,18 +1299,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) return disk_zone_no(bdev->bd_disk, sec); } -/* Whether write serialization is required for @op on zoned devices. */ -static inline bool op_needs_zoned_write_locking(enum req_op op) -{ - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; -} - -static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - enum req_op op) -{ - return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); -} - static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From a8f59e5a5deaf3e99a8b7252e96cee9af67858a9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 20 Apr 2024 16:58:11 +0900 Subject: block: use a per disk workqueue for zone write plugging A zone write plug BIO work function blk_zone_wplug_bio_work() calls submit_bio_noacct_nocheck() to execute the next unplugged BIO. This function may block. So executing zone plugs BIO works using the block layer global kblockd workqueue can potentially lead to preformance or latency issues as the number of concurrent work for a workqueue is limited to WQ_DFL_ACTIVE (256). 1) For a system with a large number of zoned disks, issuing write requests to otherwise unused zones may be delayed wiating for a work thread to become available. 2) Requeue operations which use kblockd but are independent of zone write plugging may alsoi end up being delayed. To avoid these potential performance issues, create a workqueue per zoned device to execute zone plugs BIO work. The workqueue max active parameter is set to the maximum number of zone write plugs allocated with the zone write plug mempool. This limit is equal to the maximum number of open zones of the disk and defaults to 128 for disks that do not have a limit on the number of open zones. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240420075811.1276893-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ce8ba08e318..d22eb41a05b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,6 +191,7 @@ struct gendisk { struct hlist_head *zone_wplugs_hash; struct list_head zone_wplugs_err_list; struct work_struct zone_wplugs_work; + struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) -- cgit v1.2.3 From ead083aeeed9df44fab9227e47688f7305c3a233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Apr 2024 00:34:31 -0400 Subject: set_blocksize(): switch to passing struct file * Signed-off-by: Al Viro --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 172c91879999..20c749b2ebc2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1474,7 +1474,7 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) } int bdev_read_only(struct block_device *bdev); -int set_blocksize(struct block_device *bdev, int size); +int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); -- cgit v1.2.3 From b8c873edbf35570b93edfeddad9e85da54defa52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 12 Apr 2024 01:01:36 -0400 Subject: wrapper for access to ->bd_partno On the next step it's going to get folded into a field where flags will go. Signed-off-by: Al Viro --- include/linux/blkdev.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..32549d675955 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -720,6 +720,11 @@ void invalidate_disk(struct gendisk *disk); void set_disk_ro(struct gendisk *disk, bool read_only); void disk_uevent(struct gendisk *disk, enum kobject_action action); +static inline u8 bdev_partno(const struct block_device *bdev) +{ + return bdev->bd_partno; +} + static inline int get_disk_ro(struct gendisk *disk) { return disk->part0->bd_read_only || @@ -1095,7 +1100,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, static inline bool bdev_is_partition(struct block_device *bdev) { - return bdev->bd_partno; + return bdev_partno(bdev) != 0; } enum blk_default_limits { -- cgit v1.2.3 From 1116b9fa15c09748ae05d2365a305fa22671eb1e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 12 Apr 2024 01:07:29 -0400 Subject: bdev: infrastructure for flags Replace bd_partno with a 32bit field (__bd_flags). The lower 8 bits contain the partition number, the upper 24 are for flags. Helpers: bdev_{test,set,clear}_flag(bdev, flag), with atomic_or() and atomic_andnot() used to set/clear. NOTE: this commit does not actually move any flags over there - they are still bool fields. As the result, it shifts the fields wrt cacheline boundaries; that's going to be restored once the first 3 flags are dealt with. Signed-off-by: Al Viro --- include/linux/blkdev.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32549d675955..99917e5860fd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -722,7 +722,22 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); static inline u8 bdev_partno(const struct block_device *bdev) { - return bdev->bd_partno; + return atomic_read(&bdev->__bd_flags) & BD_PARTNO; +} + +static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag) +{ + return atomic_read(&bdev->__bd_flags) & flag; +} + +static inline void bdev_set_flag(struct block_device *bdev, unsigned flag) +{ + atomic_or(flag, &bdev->__bd_flags); +} + +static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) +{ + atomic_andnot(flag, &bdev->__bd_flags); } static inline int get_disk_ro(struct gendisk *disk) -- cgit v1.2.3 From 01e198f01d55880b79d8f5ac73064ff30a89d98a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 12 Apr 2024 01:15:55 -0400 Subject: bdev: move ->bd_read_only to ->__bd_flags Signed-off-by: Al Viro --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99917e5860fd..1fe91231f85b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -742,13 +742,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0->bd_read_only || + return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } static inline int bdev_read_only(struct block_device *bdev) { - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); + return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -- cgit v1.2.3 From 186ddac2072a8134798d72635d1ed0f29889369d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 11 Apr 2024 15:53:43 +0100 Subject: block: move two helpers into bdev.c disk_live() and block_size() access bd_inode directly, prepare to remove the field bd_inode from block_device, and only access bd_inode in block layer. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20240411145346.2516848-8-viro@zeniv.linux.org.uk Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 20c749b2ebc2..99ac98ed9548 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -211,11 +211,6 @@ struct gendisk { struct blk_independent_access_ranges *ia_ranges; }; -static inline bool disk_live(struct gendisk *disk) -{ - return !inode_unhashed(disk->part0->bd_inode); -} - /** * disk_openers - returns how many openers are there for a disk * @disk: disk to check @@ -1364,11 +1359,6 @@ static inline unsigned int blksize_bits(unsigned int size) return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } -static inline unsigned int block_size(struct block_device *bdev) -{ - return 1 << bdev->bd_inode->i_blkbits; -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); @@ -1536,6 +1526,8 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); struct block_device *file_bdev(struct file *bdev_file); +bool disk_live(struct gendisk *disk); +unsigned int block_size(struct block_device *bdev); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); -- cgit v1.2.3 From 140ce28dd3bee8e53acc27f123ae474d69ef66f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 May 2024 15:00:32 +0200 Subject: block: add a disk_has_partscan helper Add a helper to check if partition scanning is enabled instead of open coding the check in a few places. This now always checks for the hidden flag even if all but one of the callers are never reachable for hidden gendisks. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240502130033.1958492-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 040a22e0eda0..3b18a40a1fc1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -230,6 +230,19 @@ static inline unsigned int disk_openers(struct gendisk *disk) return atomic_read(&disk->part0->bd_openers); } +/** + * disk_has_partscan - return %true if partition scanning is enabled on a disk + * @disk: disk to check + * + * Returns %true if partitions scanning is enabled for @disk, or %false if + * partition scanning is disabled either permanently or temporarily. + */ +static inline bool disk_has_partscan(struct gendisk *disk) +{ + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. -- cgit v1.2.3 From 504fbcffea649cad69111e7597081dd8adc3b395 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 7 May 2024 10:31:03 +0800 Subject: md: Revert "md: Fix overflow in is_mddev_idle" This reverts commit 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e. Using 64bit for 'sync_io' is unnecessary from the gendisk side. This overflow will not cause any functional impact, except for a UBSAN warning. Solving this overflow requires introducing additional calculations and checks which are not necessary. So just keep using 32bit for 'sync_io'. Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20240507023103.781816-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3b18a40a1fc1..26acf80c50c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -172,7 +172,7 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic64_t sync_io; /* RAID */ + atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3