diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-13 15:51:31 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-13 15:51:31 -0700 |
| commit | 7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (patch) | |
| tree | 64677a680f3bccc7efb8f4cfcb288006e1433cd3 /include/uapi/linux | |
| parent | b8f82cb0d84d00c04cdbdce42f67df71b8507e8b (diff) | |
| parent | 36446de0c30c62b9d89502fd36c4904996d86ecd (diff) | |
Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:
- Add shared memory zero-copy I/O support for ublk, bypassing per-I/O
copies between kernel and userspace by matching registered buffer
PFNs at I/O time. Includes selftests.
- Refactor bio integrity to support filesystem initiated integrity
operations and arbitrary buffer alignment.
- Clean up bio allocation, splitting bio_alloc_bioset() into clear fast
and slow paths. Add bio_await() and bio_submit_or_kill() helpers, and
unify the synchronous bi_end_io callbacks.
- Fix zone write plug refcount handling and plug removal races. Add
support for serializing zone writes at QD=1 for rotational zoned
devices, yielding significant throughput improvements.
- Add SED-OPAL ioctls for Single User Mode management and a STACK_RESET
command.
- Add io_uring passthrough (uring_cmd) support to the BSG layer.
- Replace pp_buf in partition scanning with struct seq_buf.
- zloop improvements and cleanups.
- drbd genl cleanup, switching to pre_doit/post_doit.
- NVMe pull request via Keith:
- Fabrics authentication updates
- Enhanced block queue limits support
- Workqueue usage updates
- A new write zeroes device quirk
- Tagset cleanup fix for loop device
- MD pull requests via Yu Kuai:
- Fix raid5 soft lockup in retry_aligned_read()
- Fix raid10 deadlock with check operation and nowait requests
- Fix raid1 overlapping writes on writemostly disks
- Fix sysfs deadlock on array_state=clear
- Proactive RAID-5 parity building with llbitmap, with
write_zeroes_unmap optimization for initial sync
- Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops
version mismatch fallback
- Fix bcache use-after-free and uninitialized closure
- Validate raid5 journal metadata payload size
- Various cleanups
- Various other fixes, improvements, and cleanups
* tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits)
ublk: fix tautological comparison warning in ublk_ctrl_reg_buf
scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd()
block: refactor blkdev_zone_mgmt_ioctl
MAINTAINERS: update ublk driver maintainer email
Documentation: ublk: address review comments for SHMEM_ZC docs
ublk: allow buffer registration before device is started
ublk: replace xarray with IDA for shmem buffer index allocation
ublk: simplify PFN range loop in __ublk_ctrl_reg_buf
ublk: verify all pages in multi-page bvec fall within registered range
ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support
xfs: use bio_await in xfs_zone_gc_reset_sync
block: add a bio_submit_or_kill helper
block: factor out a bio_await helper
block: unify the synchronous bi_end_io callbacks
xfs: fix number of GC bvecs
selftests/ublk: add read-only buffer registration test
selftests/ublk: add filesystem fio verify test for shmem_zc
selftests/ublk: add hugetlbfs shmem_zc test for loop target
selftests/ublk: add shared memory zero-copy test
selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target
...
Diffstat (limited to 'include/uapi/linux')
| -rw-r--r-- | include/uapi/linux/bsg.h | 75 | ||||
| -rw-r--r-- | include/uapi/linux/sed-opal.h | 30 | ||||
| -rw-r--r-- | include/uapi/linux/ublk_cmd.h | 80 |
3 files changed, 185 insertions, 0 deletions
diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h index cd6302def5ed..6cff77f5b857 100644 --- a/include/uapi/linux/bsg.h +++ b/include/uapi/linux/bsg.h @@ -2,6 +2,9 @@ #ifndef _UAPIBSG_H #define _UAPIBSG_H +#ifdef __KERNEL__ +#include <linux/build_bug.h> +#endif /* __KERNEL__ */ #include <linux/types.h> #define BSG_PROTOCOL_SCSI 0 @@ -63,5 +66,77 @@ struct sg_io_v4 { __u32 padding; }; +struct bsg_uring_cmd { + __u64 request; /* [i], [*i] command descriptor address */ + __u32 request_len; /* [i] command descriptor length in bytes */ + __u32 protocol; /* [i] protocol type (BSG_PROTOCOL_*) */ + __u32 subprotocol; /* [i] subprotocol type (BSG_SUB_PROTOCOL_*) */ + __u32 max_response_len; /* [i] response buffer size in bytes */ + + __u64 response; /* [i], [*o] response data address */ + __u64 dout_xferp; /* [i], [*i] */ + __u32 dout_xfer_len; /* [i] bytes to be transferred to device */ + __u32 dout_iovec_count; /* [i] 0 -> "flat" dout transfer else + * dout_xferp points to array of iovec + */ + __u64 din_xferp; /* [i], [*o] */ + __u32 din_xfer_len; /* [i] bytes to be transferred from device */ + __u32 din_iovec_count; /* [i] 0 -> "flat" din transfer */ + + __u32 timeout_ms; /* [i] timeout in milliseconds */ + __u8 reserved[12]; /* reserved for future extension */ +}; + +#ifdef __KERNEL__ +/* Must match IORING_OP_URING_CMD payload size (e.g. SQE128). */ +static_assert(sizeof(struct bsg_uring_cmd) == 80); +#endif /* __KERNEL__ */ + + +/* + * SCSI BSG io_uring completion (res2, 64-bit) + * + * When using BSG_PROTOCOL_SCSI + BSG_SUB_PROTOCOL_SCSI_CMD with + * IORING_OP_URING_CMD, the completion queue entry (CQE) contains: + * - result: errno (0 on success) + * - res2: packed SCSI status + * + * res2 bit layout: + * [0..7] device_status (SCSI status byte, e.g. CHECK_CONDITION) + * [8..15] driver_status (e.g. DRIVER_SENSE when sense data is valid) + * [16..23] host_status (e.g. 
DID_OK, DID_TIME_OUT) + * [24..31] sense_len_wr (bytes of sense data written to response buffer) + * [32..63] resid_len (residual transfer length) + */ +static inline __u8 bsg_scsi_res2_device_status(__u64 res2) +{ + return res2 & 0xff; +} +static inline __u8 bsg_scsi_res2_driver_status(__u64 res2) +{ + return res2 >> 8; +} +static inline __u8 bsg_scsi_res2_host_status(__u64 res2) +{ + return res2 >> 16; +} +static inline __u8 bsg_scsi_res2_sense_len(__u64 res2) +{ + return res2 >> 24; +} +static inline __u32 bsg_scsi_res2_resid_len(__u64 res2) +{ + return res2 >> 32; +} +static inline __u64 bsg_scsi_res2_build(__u8 device_status, __u8 driver_status, + __u8 host_status, __u8 sense_len_wr, + __u32 resid_len) +{ + return ((__u64)(__u32)(resid_len) << 32) | + ((__u64)sense_len_wr << 24) | + ((__u64)host_status << 16) | + ((__u64)driver_status << 8) | + (__u64)device_status; +} #endif /* _UAPIBSG_H */ diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 9025dd5a4f0f..ef4d3be6ca7f 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -74,6 +74,19 @@ struct opal_lr_act { __u8 align[2]; /* Align to 8 byte boundary */ }; +struct opal_lr_react { + struct opal_key key; + struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */ + __u8 num_lrs; /* + * Configure selected ranges (from lr[]) in SUM. + * If num_lrs > 0 the 'entire_table' must be 0 + */ + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */ + __u8 entire_table; /* Set all locking objects in SUM */ + __u8 align[4]; /* Align to 8 byte boundary */ +}; + struct opal_session_info { __u32 sum; __u32 who; @@ -98,6 +111,18 @@ struct opal_lr_status { __u8 align[4]; }; +struct opal_sum_ranges { + /* + * Initiate Admin1 session if key_len > 0, + * use Anybody session otherwise. 
+ */ + struct opal_key key; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; + __u8 align[5]; /* Align to 8 byte boundary */ +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -216,5 +241,10 @@ struct opal_revert_lsp { #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) +#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) +#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) +#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) +#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) +#define IOC_OPAL_STACK_RESET _IO('p', 246) #endif /* _UAPI_SED_OPAL_H */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a88876756805..6991370a72ce 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -58,6 +58,45 @@ #define UBLK_U_CMD_TRY_STOP_DEV \ _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) /* + * Register a shared memory buffer for zero-copy I/O. + * Input: ctrl_cmd.addr points to struct ublk_shmem_buf_reg (buffer VA + size) + * ctrl_cmd.len = sizeof(struct ublk_shmem_buf_reg) + * Result: >= 0 is the assigned buffer index, < 0 is error + * + * The kernel pins pages from the calling process's address space + * and inserts PFN ranges into a per-device maple tree. When a block + * request's pages match registered pages, the driver sets + * UBLK_IO_F_SHMEM_ZC and encodes the buffer index + offset in addr, + * allowing the server to access the data via its own mapping of the + * same shared memory — true zero copy. + * + * The memory can be backed by memfd, hugetlbfs, or any GUP-compatible + * shared mapping. Queue freeze is handled internally. 
+ * + * The buffer VA and size are passed via a user buffer (not inline in + * ctrl_cmd) so that unprivileged devices can prepend the device path + * to ctrl_cmd.addr without corrupting the VA. + */ +#define UBLK_U_CMD_REG_BUF \ + _IOWR('u', 0x18, struct ublksrv_ctrl_cmd) +/* + * Unregister a shared memory buffer. + * Input: ctrl_cmd.data[0] = buffer index + */ +#define UBLK_U_CMD_UNREG_BUF \ + _IOWR('u', 0x19, struct ublksrv_ctrl_cmd) + +/* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */ +struct ublk_shmem_buf_reg { + __u64 addr; /* userspace virtual address of shared memory */ + __u64 len; /* buffer size in bytes, page-aligned, default max 4GB */ + __u32 flags; + __u32 reserved; +}; + +/* Pin pages without FOLL_WRITE; usable with write-sealed memfd */ +#define UBLK_SHMEM_BUF_READ_ONLY (1U << 0) +/* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags */ @@ -370,6 +409,14 @@ /* Disable automatic partition scanning when device is started */ #define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) +/* + * Enable shared memory zero copy. When enabled, the server can register + * shared memory buffers via UBLK_U_CMD_REG_BUF. If a block request's + * pages match a registered buffer, UBLK_IO_F_SHMEM_ZC is set and addr + * encodes the buffer index + offset instead of a userspace buffer address. + */ +#define UBLK_F_SHMEM_ZC (1ULL << 19) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -469,6 +516,12 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_NEED_REG_BUF (1U << 17) /* Request has an integrity data buffer */ #define UBLK_IO_F_INTEGRITY (1UL << 18) +/* + * I/O buffer is in a registered shared memory buffer. When set, the addr + * field in ublksrv_io_desc encodes buffer index and byte offset instead + * of a userspace virtual address. 
+ */ +#define UBLK_IO_F_SHMEM_ZC (1U << 19) /* * io cmd is described by this structure, and stored in share memory, indexed @@ -743,4 +796,31 @@ struct ublk_params { struct ublk_param_integrity integrity; }; +/* + * Shared memory zero-copy addr encoding for UBLK_IO_F_SHMEM_ZC. + * + * When UBLK_IO_F_SHMEM_ZC is set, ublksrv_io_desc.addr is encoded as: + * bits [0:31] = byte offset within the buffer (up to 4GB) + * bits [32:47] = buffer index (up to 65536) + * bits [48:63] = reserved (must be zero) + */ +#define UBLK_SHMEM_ZC_OFF_MASK 0xffffffffULL +#define UBLK_SHMEM_ZC_IDX_OFF 32 +#define UBLK_SHMEM_ZC_IDX_MASK 0xffffULL + +static inline __u64 ublk_shmem_zc_addr(__u16 index, __u32 offset) +{ + return ((__u64)index << UBLK_SHMEM_ZC_IDX_OFF) | offset; +} + +static inline __u16 ublk_shmem_zc_index(__u64 addr) +{ + return (addr >> UBLK_SHMEM_ZC_IDX_OFF) & UBLK_SHMEM_ZC_IDX_MASK; +} + +static inline __u32 ublk_shmem_zc_offset(__u64 addr) +{ + return (__u32)(addr & UBLK_SHMEM_ZC_OFF_MASK); +} + #endif |
