From 0d8627cc936de8ea04f3cc1e6921c63fb72cc199 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:06 +0200 Subject: blktrace: add definitions for blk_user_trace_setup2 Add definitions for a version 2 of the blk_user_trace_setup ioctl. This new ioctl will enable a different struct layout of the binary data passed to user-space when using a new version of the blktrace utility requesting the new struct layout. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ include/uapi/linux/fs.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 1bfb635e309b..a6958708d477 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -129,6 +129,7 @@ enum { }; #define BLKTRACE_BDEV_SIZE 32 +#define BLKTRACE_BDEV_SIZE2 64 /* * User setup structure passed with BLKTRACESETUP @@ -143,4 +144,19 @@ struct blk_user_trace_setup { __u32 pid; }; +/* + * User setup structure passed with BLKTRACESETUP2 + */ +struct blk_user_trace_setup2 { + char name[BLKTRACE_BDEV_SIZE2]; /* output */ + __u64 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; + __u32 flags; /* currently unused */ + __u64 reserved[11]; +}; + #endif /* _UAPIBLKTRACE_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index beb4c2d1e41c..957ce3343a4f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,6 +300,7 @@ struct file_attr { #define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ +#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) 
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From c44347d606260f36a81f6d8415a5af33cb3015fa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:08 +0200 Subject: blktrace: add definitions for struct blk_io_trace2 Add definitions for the extended version of the blktrace protocol using a wider action type to be able to record new actions in the kernel. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index a6958708d477..9f9834d76e00 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -94,6 +94,7 @@ enum blktrace_notify { #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 +#define BLK_IO_TRACE2_VERSION 0x08 /* * The trace itself @@ -113,6 +114,21 @@ struct blk_io_trace { /* cgroup id will be stored here if exists */ }; +struct blk_io_trace2 { + __u32 magic; /* MAGIC << 8 | BLK_IO_TRACE2_VERSION */ + __u32 sequence; /* event number */ + __u64 time; /* in nanoseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 pid; /* who did it */ + __u64 action; /* what happened */ + __u32 device; /* device number */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ + __u8 pad[12]; + /* cgroup id will be stored here if it exists */ +}; /* * The remap event */ -- cgit v1.2.3 From f9ee38bbf70fb20584625849a253c8652176fa66 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:12 +0200 Subject: blktrace: add block trace commands for zone operations Add block trace commands for 
zone operations. These commands can only be handled with version 2 of the blktrace protocol. For version 1, warn if a command that does not fit into the 16 bits reserved for the command in this version is passed in. Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 9f9834d76e00..190a3c5ab0a0 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -26,11 +26,20 @@ enum blktrace_cat { BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ BLK_TC_FUA = 1 << 15, /* fua requests */ - BLK_TC_END = 1 << 15, /* we've run out of bits! */ + BLK_TC_END_V1 = 1 << 15, /* we've run out of bits! */ + + BLK_TC_ZONE_APPEND = 1ull << 16, /* zone append */ + BLK_TC_ZONE_RESET = 1ull << 17, /* zone reset */ + BLK_TC_ZONE_RESET_ALL = 1ull << 18, /* zone reset all */ + BLK_TC_ZONE_FINISH = 1ull << 19, /* zone finish */ + BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ + BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ + + BLK_TC_END_V2 = 1ull << 21, }; #define BLK_TC_SHIFT (16) -#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) +#define BLK_TC_ACT(act) ((u64)(act) << BLK_TC_SHIFT) /* * Basic trace actions -- cgit v1.2.3 From 1c164fcc1b08e75f1cad1532718f09cddc0ddebe Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:13 +0200 Subject: blktrace: expose ZONE APPEND completions to blktrace Expose ZONE APPEND completions as a block trace completion action to blktrace. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 190a3c5ab0a0..289872e51fc5 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -97,6 +97,9 @@ enum blktrace_notify { #define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) +#define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ + BLK_TC_ACT(BLK_TC_ZONE_APPEND)) + #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) -- cgit v1.2.3 From 3f6722816a73e2017599d965683dbe71833afd7a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:14 +0200 Subject: blktrace: trace zone write plugging operations Trace zone write plugging operations on block devices. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 289872e51fc5..30f3d2589365 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -62,6 +62,8 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_ZONE_PLUG, /* zone write plug was plugged */ + __BLK_TA_ZONE_UNPLUG, /* zone write plug was unplugged */ __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; @@ -99,6 +101,9 @@ enum blktrace_notify { #define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ BLK_TC_ACT(BLK_TC_ZONE_APPEND)) +#define BLK_TA_ZONE_PLUG (__BLK_TA_ZONE_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ZONE_UNPLUG (__BLK_TA_ZONE_UNPLUG |\ + BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) -- cgit v1.2.3 From bc49af56eea866c34d21bf582f65b02fc8c06ec3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 28 Oct 2025 20:34:23 -0700 Subject: blktrace: add support for REQ_OP_WRITE_ZEROES tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, REQ_OP_WRITE_ZEROES operations are not handled in the blktrace infrastructure, resulting in incorrect or missing operation labels in ftrace blktrace output. This manifests as write-zeroes operations appearing with incorrect labels like "N" instead of a proper "WZ" designation. 
This patch adds complete support for REQ_OP_WRITE_ZEROES across the blktrace infrastructure: Add BLK_TC_WRITE_ZEROES trace category in blktrace_api.h and update BLK_TC_END_V2 marker accordingly Map REQ_OP_WRITE_ZEROES to BLK_TC_WRITE_ZEROES in __blk_add_trace() to ensure proper trace event categorization Update fill_rwbs() to generate "WZ" label for write-zeroes operations in ftrace output, making them easily identifiable Add "write-zeroes" string mapping in act_to_str array for debugfs filter interface Update blk_fill_rwbs() to handle REQ_OP_WRITE_ZEROES for block layer event tracing With this fix, write-zeroes operations are now correctly traced and displayed. =========================================================== BEFORE THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-3809 [030] ..... 1212.253701: block_bio_queue: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253703: block_getrq: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_io_start: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_plug: [blkdiscard] blkdiscard-3809 [030] ..... 1212.253706: block_unplug: [blkdiscard] 1 blkdiscard-3809 [030] ..... 1212.253706: block_rq_insert: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/30:1H-566 [030] ..... 1212.253726: block_rq_issue: 259,0 NS 40960 () 0 + 80 be,0,4 [kworker/30:1H] -0 [030] d.h1. 1212.253957: block_rq_complete: 259,0 NS () 0 + 80 be,0,4 [0] -0 [030] dNh1. 
1212.253960: block_io_done: 259,0 NS 0 () 0 + 0 none,0,0 [swapper/30] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | NS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes =========================================================== AFTER THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-2477 [020] ..... 960.989131: block_bio_queue: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989134: block_getrq: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989135: block_io_start: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-2477 [020] ..... 960.989138: block_plug: [blkdiscard] blkdiscard-2477 [020] ..... 960.989140: block_unplug: [blkdiscard] 1 blkdiscard-2477 [020] ..... 960.989141: block_rq_insert: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/20:1H-736 [020] ..... 960.989166: block_rq_issue: 259,0 WZS 40960 () 0 + 80 be,0,4 [kworker/20:1H] -0 [020] d.h1. 960.989476: block_rq_complete: 259,0 WZS () 0 + 80 be,0,4 [0] -0 [020] dNh1. 
960.989482: block_io_done: 259,0 WZS 0 () 0 + 0 none,0,0 [swapper/20] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | WZS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes Tested with ftrace blktrace on NVMe devices using blkdiscard with the -z (write-zeroes) flag. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 30f3d2589365..7c092d9f3aa4 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -35,7 +35,9 @@ enum blktrace_cat { BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ - BLK_TC_END_V2 = 1ull << 21, + BLK_TC_WRITE_ZEROES = 1ull << 22, /* write-zeroes */ + + BLK_TC_END_V2 = 1ull << 22, }; #define BLK_TC_SHIFT (16) -- cgit v1.2.3 From 0bf0e2e4666822b62d7ad6473dc37fd6b377b5f1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:41 +0900 Subject: block: track zone conditions The function blk_revalidate_zone_cond() already caches the condition of all zones of a zoned block device in the zones_cond array of a gendisk. However, the zone conditions are updated only when the device is scanned or revalidated. Implement tracking of the runtime changes to zone conditions using the new cond field in struct blk_zone_wplug. 
The size of this structure remains 112 Bytes as the new field replaces the 4 Bytes padding at the end of the structure. Because zones that do not have a zone write plug can be in the empty, implicit open, explicit open or full condition, the zones_cond array of a disk is used to track the conditions of zones that do not have a zone write plug. The condition of such a zone is updated in the disk zones_cond array when a zone reset, reset all or finish operation is executed, and also when a zone write plug is removed from the disk hash table when the zone becomes full. Since a device may automatically close an implicitly open zone when writing to an empty or closed zone, if the total number of open zones has reached the device limit, the BLK_ZONE_COND_IMP_OPEN and BLK_ZONE_COND_CLOSED zone conditions cannot be precisely tracked. To overcome this, the zone condition BLK_ZONE_COND_ACTIVE is introduced to represent a zone that has the condition BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED. This follows the definition of an active zone as defined in the NVMe Zoned Namespace specifications. As such, for a zoned device that has a limit on the maximum number of open zones, we will never have more zones in the BLK_ZONE_COND_ACTIVE condition than the device limit. This is compatible with the SCSI ZBC and ATA ZAC specifications for SMR HDDs as these devices do not have a limit on the number of active zones. The function disk_zone_wplug_set_wp_offset() is modified to use the new helper disk_zone_wplug_update_cond() to update a zone write plug condition whenever a zone write plug write offset is updated on submission or merging of write BIOs to a zone. 
The functions blk_zone_reset_bio_endio(), blk_zone_reset_all_bio_endio() and blk_zone_finish_bio_endio() are modified to update the condition of the zones targeted by reset, reset_all and finish operations, either through disk_zone_wplug_set_wp_offset() for zones that have a zone write plug, or using the disk_zone_set_cond() helper to update the zones_cond array of the disk for zones that do not have a zone write plug. When a zone write plug is removed from the disk hash table (when the zone becomes empty or full), the condition of struct blk_zone_wplug is used to update the disk zones_cond array. Conversely, when a zone write plug is added to the disk hash table, the zones_cond array is used to initialize the zone write plug condition. Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index f85743ef6e7d..5c7662971414 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -48,6 +48,8 @@ enum blk_zone_type { * FINISH ZONE command. * @BLK_ZONE_COND_READONLY: The zone is read-only. * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * @BLK_ZONE_COND_ACTIVE: The zone is either implicitly open, explicitly open, + * or closed. * * The Zone Condition state machine in the ZBC/ZAC standards maps the above * deinitions as: @@ -61,6 +63,13 @@ enum blk_zone_type { * * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should * be considered invalid. + * + * The condition BLK_ZONE_COND_ACTIVE is used only with cached zone reports. + * It is used to report any of the BLK_ZONE_COND_IMP_OPEN, + * BLK_ZONE_COND_EXP_OPEN and BLK_ZONE_COND_CLOSED conditions. 
Conversely, a + * regular zone report will never report a zone condition using + * BLK_ZONE_COND_ACTIVE and instead use the conditions BLK_ZONE_COND_IMP_OPEN, + * BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED as reported by the device. */ enum blk_zone_cond { BLK_ZONE_COND_NOT_WP = 0x0, @@ -71,6 +80,8 @@ enum blk_zone_cond { BLK_ZONE_COND_READONLY = 0xD, BLK_ZONE_COND_FULL = 0xE, BLK_ZONE_COND_OFFLINE = 0xF, + + BLK_ZONE_COND_ACTIVE = 0xFF, }; /** -- cgit v1.2.3 From b30ffcdc0c15a88f8866529d3532454e02571221 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:45 +0900 Subject: block: introduce BLKREPORTZONESV2 ioctl Introduce the new BLKREPORTZONESV2 ioctl command to allow user applications access to the fast zone report implemented by blkdev_report_zones_cached(). This new ioctl is defined as number 142 and is documented in include/uapi/linux/fs.h. Unlike the existing BLKREPORTZONES ioctl, this new ioctl uses the flags field of struct blk_zone_report also as an input. If the user sets the BLK_ZONE_REP_CACHED flag as an input, then blkdev_report_zones_cached() is used to generate the zone report using cached zone information. If this flag is not set, then BLKREPORTZONESV2 behaves in the same manner as BLKREPORTZONES and the zone report is generated by accessing the zoned device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 35 ++++++++++++++++++++++++++++++----- include/uapi/linux/fs.h | 2 +- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 5c7662971414..e33f02703350 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -87,10 +87,20 @@ enum blk_zone_cond { /** * enum blk_zone_report_flags - Feature flags of reported zone descriptors. 
* - * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + * @BLK_ZONE_REP_CAPACITY: Output only. Indicates that zone descriptors in a + * zone report have a valid capacity field. + * @BLK_ZONE_REP_CACHED: Input only. Indicates that the zone report should be + * generated using cached zone information. In this case, + * the implicit open, explicit open and closed zone + * conditions are all reported with the + * BLK_ZONE_COND_ACTIVE condition. */ enum blk_zone_report_flags { - BLK_ZONE_REP_CAPACITY = (1 << 0), + /* Output flags */ + BLK_ZONE_REP_CAPACITY = (1U << 0), + + /* Input flags */ + BLK_ZONE_REP_CACHED = (1U << 31), }; /** @@ -133,6 +143,10 @@ struct blk_zone { * @sector: starting sector of report * @nr_zones: IN maximum / OUT actual * @flags: one or more flags as defined by enum blk_zone_report_flags. + * @flags: one or more flags as defined by enum blk_zone_report_flags. + * With BLKREPORTZONE, this field is ignored as an input and is valid + * only as an output. Using BLKREPORTZONEV2, this field is used as both + * input and output. * @zones: Space to hold @nr_zones @zones entries on reply. * * The array of at most @nr_zones must follow this structure in memory. @@ -159,9 +173,19 @@ struct blk_zone_range { /** * Zoned block device ioctl's: * - * @BLKREPORTZONE: Get zone information. Takes a zone report as argument. - * The zone report will start from the zone containing the - * sector specified in the report request structure. + * @BLKREPORTZONE: Get zone information from a zoned device. Takes a zone report + * as argument. The zone report will start from the zone + * containing the sector specified in struct blk_zone_report. + * The flags field of struct blk_zone_report is used as an + * output only and ignored as an input. + * DEPRECATED, use BLKREPORTZONEV2 instead. 
+ * @BLKREPORTZONEV2: Same as @BLKREPORTZONE but uses the flags field of + * struct blk_zone_report as an input, allowing to get a zone + * report using cached zone information if the flag + * BLK_ZONE_REP_CACHED is set. In such case, the zone report + * may include zones with the condition @BLK_ZONE_COND_ACTIVE + * (c.f. the description of this condition above for more + * details). * @BLKRESETZONE: Reset the write pointer of the zones in the specified * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. @@ -180,5 +204,6 @@ struct blk_zone_range { #define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) #define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) #define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) +#define BLKREPORTZONEV2 _IOWR(0x12, 142, struct blk_zone_report) #endif /* _UAPI_BLKZONED_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 957ce3343a4f..66ca526cf786 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -298,7 +298,7 @@ struct file_attr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) #define BLKGETDISKSEQ _IOR(0x12,128,__u64) -/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ +/* 130-136 and 142 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ #define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) -- cgit v1.2.3 From 62ed1b58224636185fa689db81224b8c8af46473 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 3 Nov 2025 20:57:57 +0800 Subject: md: allow configuring logical block size Previously, raid array used the maximum logical block size (LBS) of all member disks. Adding a larger LBS disk at runtime could unexpectedly increase RAID's LBS, risking corruption of existing partitions. This can be reproduced by: ``` # LBS of sd[de] is 512 bytes, sdf is 4096 bytes. 
mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean # LBS is 512 cat /sys/block/md0/queue/logical_block_size # create partition md0p1 parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100% lsblk | grep md0p1 # LBS becomes 4096 after adding sdf mdadm --add -q /dev/md0 /dev/sdf cat /sys/block/md0/queue/logical_block_size # partition lost partprobe /dev/md0 lsblk | grep md0p1 ``` Simply restricting larger-LBS disks is inflexible. In some scenarios, only disks with 512 bytes LBS are available currently, but later, disks with 4KB LBS may be added to the array. Making LBS configurable is the best way to solve this scenario. After this patch, the raid will: - store LBS in disk metadata - add a read-write sysfs 'mdX/logical_block_size' Future mdadm should support setting LBS via metadata field during RAID creation and the new sysfs. Though the kernel allows runtime LBS changes, users should avoid modifying it after creating partitions or filesystems to prevent compatibility issues. Only 1.x metadata supports configurable LBS. 0.90 metadata inits all fields to default values at auto-detect. Supporting 0.90 would require more extensive changes and no such use case has been observed. Note that many RAID paths rely on PAGE_SIZE alignment, including for metadata I/O. A larger LBS than PAGE_SIZE will result in metadata read/write failures. So this config should be prevented. 
Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Xiao Ni Signed-off-by: Yu Kuai --- include/uapi/linux/raid/md_p.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ac74133a4768..310068bb2a1d 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -291,7 +291,8 @@ struct mdp_superblock_1 { __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ __le32 sb_csum; /* checksum up to devs[max_dev] */ __le32 max_dev; /* size of devs[] array to consider */ - __u8 pad3[64-32]; /* set to 0 when writing */ + __le32 logical_block_size; /* same as q->limits->logical_block_size */ + __u8 pad3[64-36]; /* set to 0 when writing */ /* device state information. Indexed by dev_number. * 2 bytes per device -- cgit v1.2.3