From 5c5028ee594ce5f907ca6ad1c32cca6a15098464 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Oct 2025 13:47:15 -0700 Subject: block: rename min_segment_size Despite its name, the block layer is fine with segments smaller than the "min_segment_size" limit. The value is an optimization limit indicating the largest segment that can be used without considering boundary limits. Smaller segments can take a fast path, so give it a name that reflects that: max_fast_segment_size. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 70b671a9a7f7..99be263b31ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -378,7 +378,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; - unsigned int min_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 113cbd62824afdf62d2f3f092809cf37cc7f1dd8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:07 +0200 Subject: blktrace: pass blk_user_trace2 to setup functions Pass struct blk_user_trace_setup2 to blktrace_setup_finalize(). This prepares for the incoming extension of the blktrace protocol with a 64bit act_mask. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; -- cgit v1.2.3 From ec7f31b2a2d3bf6b9e4d4b8cd156587f1d0607d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:16:45 -0500 Subject: block: make bio auto-integrity deadlock safe The current block layer automatic integrity protection allocates the actual integrity buffer, which has three problems: - because it happens at the bottom of the I/O stack and doesn't use a mempool it can deadlock under load - because the data size in a bio is almost unbounded when using large folios it can relatively easily exceed the maximum kmalloc size - even when it does not exceed the maximum kmalloc size, it could exceed the maximum segment size of the device Fix this by limiting the I/O size so that we can allocate at least a 2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity intervals, and create a mempool as a last resort for this maximum size, mirroring the scheme used for bvecs. As a nice upside none of this can fail now, so we remove the error handling and open code the trivial addition of the bip vec. The new allocation helpers sit outside of bio-integrity-auto.c because I plan to reuse them for file system based PI in the near future. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 ++++++ include/linux/blk-integrity.h | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 851254f36eb3..3d05296a5afe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -14,6 +14,8 @@ enum bip_flags { BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_P2P_DMA = 1 << 8, /* using P2P address */ + + BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { @@ -140,4 +142,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); +void bio_integrity_free_buf(struct bio_integrity_payload *bip); + #endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index b659373788f6..c2030fd8ba0a 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,6 +8,11 @@ struct request; +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, -- cgit v1.2.3 From fdb9aed869f34d776298b3a8197909eb820e4d0d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:38 +0900 Subject: block: introduce disk_report_zone() Commit b76b840fd933 ("dm: Fix dm-zoned-reclaim zone write pointer alignment") introduced an indirect call for the callback function of a report zones executed with blkdev_report_zones(). This is necessary so that the function disk_zone_wplug_sync_wp_offset() can be called to refresh a zone write plug zone write pointer offset after a write error. 
However, this solution makes following the path of a zone information harder to understand. Clean this up by introducing the new blk_report_zones_args structure to define a zone report callback and its private data and introduce the helper function disk_report_zone() which calls both disk_zone_wplug_sync_wp_offset() and the zone report user callback function for all zones of a zone report. This helper function must be called by all block device drivers that implement the report zones block operation in order to correctly report a zone information. All block device drivers supporting the report_zones block operation are updated to use this new scheme. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++++- include/linux/device-mapper.h | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99be263b31ab..2f75fb15f55f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; @@ -432,6 +433,9 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -1662,7 +1666,8 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned 
long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 84fdc3a6a19a..38f625af6ab4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { struct dm_target *tgt; + struct gendisk *disk; sector_t next_sector; - void *orig_data; - report_zones_cb orig_cb; unsigned int zone_idx; + /* for block layer ->report_zones */ + struct blk_report_zones_args *rep_args; + + /* for internal users */ + report_zones_cb cb; + void *data; + /* must be filled by ->report_zones before calling dm_report_zones_cb */ sector_t start; }; -- cgit v1.2.3 From 6e945ffb6555705cf20b1fcdc21a139911562995 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:40 +0900 Subject: block: use zone condition to determine conventional zones The conv_zones_bitmap field of struct gendisk is used to define a bitmap to identify the conventional zones of a zoned block device. The bit for a zone is set in this bitmap if the zone is a conventional one, that is, if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such zone, this always corresponds to the zone condition BLK_ZONE_COND_NOT_WP. In other words, conv_zones_bitmap tracks a single condition of the zones of a zoned block device. In preparation for tracking more zone conditions, change conv_zones_bitmap into an array of zone conditions, using 1 byte per zone. 
This increases the memory usage from 1 bit per zone to 1 byte per zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256 MiB zones. This is a trade-off to allow fast cached report zones later on top of this change. Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add a blk_revalidate_zone_cond() function to initialize the zones_cond array of a disk during device scan and to update it on device revalidation. Move the allocation of the zones_cond array to disk_revalidate_zone_resources(), making sure that this array is always allocated, even for devices that do not need zone write plugs (zone resources), to ensure that bdev_zone_is_seq() can be re-implemented to use the zone condition array in place of the conv zones bitmap. Finally, the function bdev_zone_is_seq() is rewritten to use a test on the condition of the target zone. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f75fb15f55f..53bcfbc2f68f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long __rcu *conv_zones_bitmap; + u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; @@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev, { return disk_zone_capacity(bdev->bd_disk, pos); } + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + static inline bool bio_needs_zone_write_plugging(struct bio *bio) { return false; @@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev, return bdev_is_zone_start(bdev, sector); } -/** - * bdev_zone_is_seq - check if a sector belongs to a sequential write zone - * @bdev: block device to check - * @sector: sector number - * - * Check if @sector on @bdev is contained in a sequential write required zone. 
- */ -static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) -{ - bool is_seq = false; - -#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) - if (bdev_is_zoned(bdev)) { - struct gendisk *disk = bdev->bd_disk; - unsigned long *bitmap; - - rcu_read_lock(); - bitmap = rcu_dereference(disk->conv_zones_bitmap); - is_seq = !bitmap || - !test_bit(disk_zone_no(disk, sector), bitmap); - rcu_read_unlock(); - } -#endif - - return is_seq; -} - int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); -- cgit v1.2.3 From f2284eec5053df271c78e687672247922bcee881 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:43 +0900 Subject: block: introduce blkdev_get_zone_info() Introduce the function blkdev_get_zone_info() to obtain a single zone information from cached zone data, that is, either from the zone write plug for the target zone if it exists and from the disk zones_cond array otherwise. Since sequential zones that do not have a zone write plug are either full, empty or in a bad state (read-only or offline), the zone write pointer can be inferred from the zone condition cached in the disk zones_cond array. For sequential zones that have a zone write plug, the zone condition and zone write pointer are obtained from the condition and write pointer offset managed with the zone write plug. This allows obtaining the information for a zone much more quickly than having to execute a report zones command on the device. blkdev_get_zone_info() falls back to using a regular zone report if the target zone is flagged as needing an update with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not use zone write plugs (i.e. a device mapper device). 
In this case, the new function blkdev_report_zone_fallback() is used and the zone condition is reported consistently with the cached report, that is, the BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open, explicit open and closed conditions. This is achieved by adding the .report_active field to struct blk_report_zones_args and by having disk_report_zone() set the correct zone condition if .report_active is true. In preparation for using blkdev_get_zone_info() in upcoming file systems changes, also export this function as a GPL symbol. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53bcfbc2f68f..03a594b4dfbc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -436,6 +436,9 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args); +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -- cgit v1.2.3 From 31f0656a4ab712edf2888eabcc0664197a4a938e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:44 +0900 Subject: block: introduce blkdev_report_zones_cached() Introduce the function blkdev_report_zones_cached() to provide a fast report zone built using the blkdev_get_zone_info() function, which gets zone information from a disk zones_cond array or zone write plugs. 
For a large capacity SMR drive, such fast report zone can be completed in a few milliseconds compared to several seconds completion times when the report zone is obtained from the device. The zone report is built in the same manner as with the regular blkdev_report_zones() function, that is, the first zone reported is the one containing the specified start sector and the report is limited to the specified number of zones (nr_zones argument). The information for each zone in the report is obtained using blkdev_get_zone_info(). For zoned devices that do not use zone write plug resources, using blkdev_get_zone_info() is inefficient as the zone report would be very slow, generated one zone at a time. To avoid this, blkdev_report_zones_cached() falls back to calling blkdev_do_report_zones() to execute a regular zone report. In this case, the .report_active field of struct blk_report_zones_args is set to true to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in place of the implicit open, explicit open and closed conditions. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 03a594b4dfbc..f0ab02e0a673 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,6 +442,8 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); -- cgit v1.2.3 From 15638d52cbcf6e969f4a5e2757b118355db583f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2025 14:52:15 -0500 Subject: block: fix cached zone reporting after zone append was used No zone plugs are allocated when a zone is opened by calling Zone Append on it. This makes the cached zone reporting report incorrectly empty zones if the file system is unmounted and report zones is called after that, e.g. by xfstests test cases using the scratch device. Fix this by recording if zone append was used on a device, and disable cached reporting for the device until a ZONE_RESET_ALL happens that guarantees all zones are empty. We could probably do even better using a per-zone flag, but the practical use cases for zone reporting after the initial mount are rather limited, so let's keep things simple for now. 
Fixes: 31f0656a4ab7 ("block: introduce blkdev_report_zones_cached()") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f0ab02e0a673..6a498aa7f7e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -173,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 2f6b2565d43cdb5087cac23d530cca84aa3d897e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Oct 2025 08:04:55 -0700 Subject: block: accumulate memory segment gaps per bio The blk-mq dma iterator has an optimization for requests that align to the device's iommu merge boundary. This boundary may be larger than the device's virtual boundary, but the code had been depending on that queue limit to know ahead of time if the request is guaranteed to align to that optimization. Rather than rely on that queue limit, which many devices may not report, save the lowest set bit of any boundary gap between each segment in the bio while checking the segments. The request stores the value for merging and quickly checking per io if the request can use iova optimizations. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk-mq.h | 16 ++++++++++++++++ include/linux/blk_types.h | 12 ++++++++++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 16c1c85613b7..ad2d57908c1c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align); +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b25d12545f46..b54506b3b76d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,6 +152,14 @@ struct request { unsigned short nr_phys_segments; unsigned short nr_integrity_segments; + /* + * The lowest set bit for address gaps between physical segments. This + * provides information necessary for dma optimization opprotunities, + * like for testing if the segments can be coalesced against the + * device's iommu granule. + */ + unsigned char phys_gap_bit; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; @@ -208,6 +216,14 @@ struct request { void *end_io_data; }; +/* + * Returns a mask with all bits starting at req->phys_gap_bit set to 1. 
+ */ +static inline unsigned long req_phys_gap_mask(const struct request *req) +{ + return ~(((1 << req->phys_gap_bit) >> 1) - 1); +} + static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8e8d1cc8b06c..53501ebb0623 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -218,6 +218,18 @@ struct bio { enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; + + /* + * The bvec gap bit indicates the lowest set bit in any address offset + * between all bi_io_vecs. This field is initialized only after the bio + * is split to the hardware limits (see bio_split_io_at()). The value + * may be used to consider DMA optimization when performing that + * mapping. The value is compared to a power of two mask where the + * result depends on any bit set within the mask, so saving the lowest + * bit is sufficient to know if any segment gap collides with the mask. + */ + u8 bi_bvec_gap_bit; + atomic_t __bi_remaining; struct bvec_iter bi_iter; -- cgit v1.2.3 From 25976c314f6596254c9b1e2291d94393b7d5ae81 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Nov 2025 15:38:44 +0900 Subject: block: introduce bdev_zone_start() Introduce the function bdev_zone_start() as a more explicit (and clear) replacement for ALIGN_DOWN() to get the start sector of a zone containing a particular sector of a zoned block device. Use this new helper in blkdev_get_zone_info() and blkdev_report_zones_cached(). 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a498aa7f7e7..2fff8a80dbd2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return q->limits.chunk_sectors; } +static inline sector_t bdev_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & ~(bdev_zone_sectors(bdev) - 1); +} + static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { -- cgit v1.2.3 From 37f0c7a8df7ad719a68fa1c2dbf066cfebc391a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 14 Nov 2025 11:07:04 +0200 Subject: block-dma: properly take MMIO path In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to indicate MMIO memory"), DMA_ATTR_MMIO attribute was added to describe MMIO addresses, which require to avoid any memory cache flushing, as an outcome of the discussion pointed in Link tag below. In case of PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, blk-mq-dm logic treated this as regular page and relied on "struct page" DMA flow. That flow performs CPU cache flushing, which shouldn't be done here, and doesn't set IOMMU_MMIO flag in DMA-IOMMU case. As a solution, let's encode peer-to-peer transaction type in NVMe IOD flags variable and provide it to blk-mq-dma API. 
Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 1 - include/linux/blk-integrity.h | 14 -------------- include/linux/blk-mq-dma.h | 28 +++++++++++++--------------- include/linux/blk_types.h | 2 -- 4 files changed, 13 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3d05296a5afe..21e4652dcfd2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,7 +13,6 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ - BIP_P2P_DMA = 1 << 8, /* using P2P address */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c2030fd8ba0a..a6b84206eb94 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -33,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); -} - int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -129,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - 
return false; -} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 51829958d872..cb88fc791fbd 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -16,13 +16,13 @@ struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; u32 len; + struct pci_p2pdma_map_state p2pdma; /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ struct blk_map_iter iter; - struct pci_p2pdma_map_state p2pdma; }; bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, @@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_dma_unmap - try to DMA unmap a request + * blk_rq_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap - * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR + * @map: peer-to-peer mapping type * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. 
*/ -static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len, bool is_p2p) +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len, + enum pci_p2pdma_map_type map) { - if (is_p2p) + if (map == PCI_P2PDMA_MAP_BUS_ADDR) return true; if (dma_use_iova(state)) { + unsigned int attrs = 0; + + if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), - 0); + attrs); return true; } return !dma_need_unmap(dma_dev); } - -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - req->cmd_flags & REQ_P2PDMA); -} - #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 53501ebb0623..d884cc1256ec 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -393,7 +393,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -426,7 +425,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From 42adb2d4ef24d2834cbd3bb96a6660826ae763da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:06 -0800 Subject: fs: Add the __data_racy annotation to backing_dev_info.ra_pages Some but not all .ra_pages changes happen while block layer I/O is paused with blk_mq_freeze_queue(). 
Filesystems may read .ra_pages even while block layer I/O is paused, e.g. from inside their .fadvise callback. Annotating all .ra_pages reads with READ_ONCE() would be cumbersome. Hence, add the __data_racy annotation to the .ra_pages member variable. Cc: Alexander Viro Cc: Christian Brauner Cc: Nilay Shroff Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..30f4bd9ff7c8 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -168,7 +168,9 @@ struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + /* max readahead in PAGE_SIZE units */ + unsigned long __data_racy ra_pages; + unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ -- cgit v1.2.3 From 935a20d1bebf6236076785fac3ff81e3931834e9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:07 -0800 Subject: block: Remove queue freezing from several sysfs store callbacks Freezing the request queue from inside sysfs store callbacks may cause a deadlock in combination with the dm-multipath driver and the queue_if_no_path option. Additionally, freezing the request queue slows down system boot on systems where sysfs attributes are set synchronously. Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue() calls from the store callbacks that do not strictly need these callbacks. Add the __data_racy annotation to request_queue.rq_timeout to suppress KCSAN data race reports about the rq_timeout reads. This patch may cause a small delay in applying the new settings. 
For all the attributes affected by this patch, I/O will complete correctly whether the old or the new value of the attribute is used. This patch affects the following sysfs attributes: * io_poll_delay * io_timeout * nomerges * read_ahead_kb * rq_affinity Here is an example of a deadlock triggered by running test srp/002 if this patch is not applied: task:multipathd Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 schedule_preempt_disabled+0x1c/0x30 __mutex_lock+0xb89/0x1650 mutex_lock_nested+0x1f/0x30 dm_table_set_restrictions+0x823/0xdf0 __bind+0x166/0x590 dm_swap_table+0x2a7/0x490 do_resume+0x1b1/0x610 dev_suspend+0x55/0x1a0 ctl_ioctl+0x3a5/0x7e0 dm_ctl_ioctl+0x12/0x20 __x64_sys_ioctl+0x127/0x1a0 x64_sys_call+0xe2b/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 task:(udev-worker) Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 blk_mq_freeze_queue_wait+0xf2/0x140 blk_mq_freeze_queue_nomemsave+0x23/0x30 queue_ra_store+0x14e/0x290 queue_attr_store+0x23e/0x2c0 sysfs_kf_write+0xde/0x140 kernfs_fop_write_iter+0x3b2/0x630 vfs_write+0x4fd/0x1390 ksys_write+0xfd/0x230 __x64_sys_write+0x76/0xc0 x64_sys_call+0x276/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Cc: Christoph Hellwig Cc: Ming Lei Cc: Nilay Shroff Cc: Martin Wilck Cc: Benjamin Marzinski Cc: stable@vger.kernel.org Fixes: af2814149883 ("block: freeze the queue in queue_attr_store") Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fff8a80dbd2..cb4ba09959ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -495,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags; - unsigned int rq_timeout; + unsigned int __data_racy rq_timeout; unsigned int queue_depth; -- cgit v1.2.3 From 
d0c98769ee7d5db8d699a270690639cde1766cd4 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:13 +0800 Subject: blk-mq: use array manage hctx map instead of xarray After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use an xarray instead of an array to store hctx, but in poll mode, each time in blk_mq_poll, we need to use xa_load to find the corresponding hctx, which introduces some cost. In my test, xa_load may cost 3.8% cpu. This patch reverts the previous change, eliminating the overhead of xa_load, and can result in a 3% performance improvement. Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- include/linux/blkdev.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b54506b3b76d..9208ff90ae16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1016,7 +1016,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) } #define queue_for_each_hw_ctx(q, hctx, i) \ - xa_for_each(&(q)->hctx_table, (i), (hctx)) + for ((i) = 0; (i) < (q)->nr_hw_queues && \ + ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cb4ba09959ee..6195f89648db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct xarray hctx_table; + struct blk_mq_hw_ctx **queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From 89e1fb7ceffd898505ad7fa57acec0585bfaa2cc Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:14 +0800 Subject: blk-mq: fix potential uaf for 'queue_hw_ctx' This just applies Kuai's patch in [1] with minor changes. blk_mq_realloc_hw_ctxs() will free the 'queue_hw_ctx'(e.g.
update submit_queues through configfs for null_blk), while it might still be used from another context(e.g. switch elevator to none): t1 t2 elevator_switch blk_mq_unquiesce_queue blk_mq_run_hw_queues queue_for_each_hw_ctx // assembly code for hctx = (q)->queue_hw_ctx[i] mov 0x48(%rbp),%rdx -> read old queue_hw_ctx __blk_mq_update_nr_hw_queues blk_mq_realloc_hw_ctxs hctxs = q->queue_hw_ctx q->queue_hw_ctx = new_hctxs kfree(hctxs) movslq %ebx,%rax mov (%rdx,%rax,8),%rdi ->uaf This problem was found by code review, and I confirmed that the concurrent scenario does exist(specifically 'q->queue_hw_ctx' can be changed during blk_mq_run_hw_queues()), however, the uaf problem hasn't been reproduced yet without hacking the kernel. Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the problem by protecting 'queue_hw_ctx' through rcu where it can be accessed without grabbing 'q_usage_counter'. [1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/ Signed-off-by: Yu Kuai Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 13 ++++++++++++- include/linux/blkdev.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9208ff90ae16..eb7254b3dddd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1015,9 +1015,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return rq + 1; } +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) +{ + struct blk_mq_hw_ctx *hctx; + + rcu_read_lock(); + hctx = rcu_dereference(q->queue_hw_ctx)[id]; + rcu_read_unlock(); + + return hctx; +} + #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ - ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) + ({ hctx = queue_hctx((q), i); 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h
b/include/linux/blkdev.h index 6195f89648db..72e34acd439c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx **queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From 9574b21e952256d4fa3c8797c94482a240992d18 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:23 +0800 Subject: kfifo: add kfifo_alloc_node() helper for NUMA awareness Add __kfifo_alloc_node() by refactoring and reusing __kfifo_alloc(), and define kfifo_alloc_node() macro to support NUMA-aware memory allocation. The new __kfifo_alloc_node() function accepts a NUMA node parameter and uses kmalloc_array_node() instead of kmalloc_array() for node-specific allocation. The existing __kfifo_alloc() now calls __kfifo_alloc_node() with NUMA_NO_NODE to maintain backward compatibility. This enables users to allocate kfifo buffers on specific NUMA nodes, which is important for performance in NUMA systems where the kfifo will be primarily accessed by threads running on specific nodes. 
Cc: Stefani Seibold Cc: Andrew Morton Cc: linux-kernel@vger.kernel.org Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/kfifo.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index fd743d4c4b4b..8b81ac74829c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -369,6 +369,30 @@ __kfifo_int_must_check_helper( \ }) \ ) +/** + * kfifo_alloc_node - dynamically allocates a new fifo buffer on a NUMA node + * @fifo: pointer to the fifo + * @size: the number of elements in the fifo, this must be a power of 2 + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @node: NUMA node to allocate memory on + * + * This macro dynamically allocates a new fifo buffer with NUMA node awareness. + * + * The number of elements will be rounded-up to a power of 2. + * The fifo will be release with kfifo_free(). + * Return 0 if no error, otherwise an error code. + */ +#define kfifo_alloc_node(fifo, size, gfp_mask, node) \ +__kfifo_int_must_check_helper( \ +({ \ + typeof((fifo) + 1) __tmp = (fifo); \ + struct __kfifo *__kfifo = &__tmp->kfifo; \ + __is_kfifo_ptr(__tmp) ? 
\ + __kfifo_alloc_node(__kfifo, size, sizeof(*__tmp->type), gfp_mask, node) : \ + -EINVAL; \ +}) \ +) + /** * kfifo_free - frees the fifo * @fifo: the fifo to be freed @@ -899,8 +923,14 @@ __kfifo_uint_must_check_helper( \ ) -extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask); +extern int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask, int node); + +static inline int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + return __kfifo_alloc_node(fifo, size, esize, gfp_mask, NUMA_NO_NODE); +} extern void __kfifo_free(struct __kfifo *fifo); -- cgit v1.2.3 From 418de94e7593081c29066555bf9059f1f7dd9d79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Nov 2025 22:57:54 -0800 Subject: sbitmap: fix all kernel-doc warnings Modify kernel-doc comments in sbitmap.h to prevent warnings: Warning: include/linux/sbitmap.h:84 struct member 'alloc_hint' not described in 'sbitmap' Warning: include/linux/sbitmap.h:151 struct member 'ws_active' not described in 'sbitmap_queue' Warning: include/linux/sbitmap.h:552 No description found for return value of 'sbq_wait_ptr' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index ffb9907c7070..cc7ad189caa5 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -75,7 +75,7 @@ struct sbitmap { */ struct sbitmap_word *map; - /* + /** * @alloc_hint: Cache of last successfully allocated or freed bit. 
* * This is per-cpu, which allows multiple users to stick to different @@ -128,7 +128,7 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; - /* + /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; @@ -547,6 +547,8 @@ static inline void sbq_index_atomic_inc(atomic_t *index) * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. + * + * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) -- cgit v1.2.3