summaryrefslogtreecommitdiff
path: root/include/uapi/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 15:51:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 15:51:31 -0700
commit7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (patch)
tree64677a680f3bccc7efb8f4cfcb288006e1433cd3 /include/uapi/linux
parentb8f82cb0d84d00c04cdbdce42f67df71b8507e8b (diff)
parent36446de0c30c62b9d89502fd36c4904996d86ecd (diff)
Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe: - Add shared memory zero-copy I/O support for ublk, bypassing per-I/O copies between kernel and userspace by matching registered buffer PFNs at I/O time. Includes selftests. - Refactor bio integrity to support filesystem initiated integrity operations and arbitrary buffer alignment. - Clean up bio allocation, splitting bio_alloc_bioset() into clear fast and slow paths. Add bio_await() and bio_submit_or_kill() helpers, unify synchronous bi_end_io callbacks. - Fix zone write plug refcount handling and plug removal races. Add support for serializing zone writes at QD=1 for rotational zoned devices, yielding significant throughput improvements. - Add SED-OPAL ioctls for Single User Mode management and a STACK_RESET command. - Add io_uring passthrough (uring_cmd) support to the BSG layer. - Replace pp_buf in partition scanning with struct seq_buf. - zloop improvements and cleanups. - drbd genl cleanup, switching to pre_doit/post_doit. - NVMe pull request via Keith: - Fabrics authentication updates - Enhanced block queue limits support - Workqueue usage updates - A new write zeroes device quirk - Tagset cleanup fix for loop device - MD pull requests via Yu Kuai: - Fix raid5 soft lockup in retry_aligned_read() - Fix raid10 deadlock with check operation and nowait requests - Fix raid1 overlapping writes on writemostly disks - Fix sysfs deadlock on array_state=clear - Proactive RAID-5 parity building with llbitmap, with write_zeroes_unmap optimization for initial sync - Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops version mismatch fallback - Fix bcache use-after-free and uninitialized closure - Validate raid5 journal metadata payload size - Various cleanups - Various other fixes, improvements, and cleanups * tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits) ublk: fix tautological comparison warning in ublk_ctrl_reg_buf scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd() block: refactor blkdev_zone_mgmt_ioctl MAINTAINERS: update ublk driver maintainer email Documentation: ublk: address review comments for SHMEM_ZC docs ublk: allow buffer registration before device is started ublk: replace xarray with IDA for shmem buffer index allocation ublk: simplify PFN range loop in __ublk_ctrl_reg_buf ublk: verify all pages in multi-page bvec fall within registered range ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support xfs: use bio_await in xfs_zone_gc_reset_sync block: add a bio_submit_or_kill helper block: factor out a bio_await helper block: unify the synchronous bi_end_io callbacks xfs: fix number of GC bvecs selftests/ublk: add read-only buffer registration test selftests/ublk: add filesystem fio verify test for shmem_zc selftests/ublk: add hugetlbfs shmem_zc test for loop target selftests/ublk: add shared memory zero-copy test selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target ...
Diffstat (limited to 'include/uapi/linux')
-rw-r--r--include/uapi/linux/bsg.h75
-rw-r--r--include/uapi/linux/sed-opal.h30
-rw-r--r--include/uapi/linux/ublk_cmd.h80
3 files changed, 185 insertions, 0 deletions
diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h
index cd6302def5ed..6cff77f5b857 100644
--- a/include/uapi/linux/bsg.h
+++ b/include/uapi/linux/bsg.h
@@ -2,6 +2,9 @@
#ifndef _UAPIBSG_H
#define _UAPIBSG_H
+#ifdef __KERNEL__
+#include <linux/build_bug.h>
+#endif /* __KERNEL__ */
#include <linux/types.h>
#define BSG_PROTOCOL_SCSI 0
@@ -63,5 +66,77 @@ struct sg_io_v4 {
__u32 padding;
};
+struct bsg_uring_cmd {
+ __u64 request; /* [i], [*i] command descriptor address */
+ __u32 request_len; /* [i] command descriptor length in bytes */
+ __u32 protocol; /* [i] protocol type (BSG_PROTOCOL_*) */
+ __u32 subprotocol; /* [i] subprotocol type (BSG_SUB_PROTOCOL_*) */
+ __u32 max_response_len; /* [i] response buffer size in bytes */
+
+ __u64 response; /* [i], [*o] response data address */
+ __u64 dout_xferp; /* [i], [*i] */
+ __u32 dout_xfer_len; /* [i] bytes to be transferred to device */
+ __u32 dout_iovec_count; /* [i] 0 -> "flat" dout transfer else
+ * dout_xferp points to array of iovec
+ */
+ __u64 din_xferp; /* [i], [*o] */
+ __u32 din_xfer_len; /* [i] bytes to be transferred from device */
+ __u32 din_iovec_count; /* [i] 0 -> "flat" din transfer */
+
+ __u32 timeout_ms; /* [i] timeout in milliseconds */
+ __u8 reserved[12]; /* reserved for future extension */
+};
+
+#ifdef __KERNEL__
+/* Must match IORING_OP_URING_CMD payload size (e.g. SQE128). */
+static_assert(sizeof(struct bsg_uring_cmd) == 80);
+#endif /* __KERNEL__ */
+
+
+/*
+ * SCSI BSG io_uring completion (res2, 64-bit)
+ *
+ * When using BSG_PROTOCOL_SCSI + BSG_SUB_PROTOCOL_SCSI_CMD with
+ * IORING_OP_URING_CMD, the completion queue entry (CQE) contains:
+ * - result: errno (0 on success)
+ * - res2: packed SCSI status
+ *
+ * res2 bit layout:
+ * [0..7] device_status (SCSI status byte, e.g. CHECK_CONDITION)
+ * [8..15] driver_status (e.g. DRIVER_SENSE when sense data is valid)
+ * [16..23] host_status (e.g. DID_OK, DID_TIME_OUT)
+ * [24..31] sense_len_wr (bytes of sense data written to response buffer)
+ * [32..63] resid_len (residual transfer length)
+ */
+static inline __u8 bsg_scsi_res2_device_status(__u64 res2)
+{
+ return res2 & 0xff;
+}
+static inline __u8 bsg_scsi_res2_driver_status(__u64 res2)
+{
+ return res2 >> 8;
+}
+static inline __u8 bsg_scsi_res2_host_status(__u64 res2)
+{
+ return res2 >> 16;
+}
+static inline __u8 bsg_scsi_res2_sense_len(__u64 res2)
+{
+ return res2 >> 24;
+}
+static inline __u32 bsg_scsi_res2_resid_len(__u64 res2)
+{
+ return res2 >> 32;
+}
+static inline __u64 bsg_scsi_res2_build(__u8 device_status, __u8 driver_status,
+ __u8 host_status, __u8 sense_len_wr,
+ __u32 resid_len)
+{
+ return ((__u64)(__u32)(resid_len) << 32) |
+ ((__u64)sense_len_wr << 24) |
+ ((__u64)host_status << 16) |
+ ((__u64)driver_status << 8) |
+ (__u64)device_status;
+}
#endif /* _UAPIBSG_H */
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 9025dd5a4f0f..ef4d3be6ca7f 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -74,6 +74,19 @@ struct opal_lr_act {
__u8 align[2]; /* Align to 8 byte boundary */
};
+struct opal_lr_react {
+ struct opal_key key;
+ struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */
+ __u8 num_lrs; /*
+ * Configure selected ranges (from lr[]) in SUM.
+ * If num_lrs > 0 the 'entire_table' must be 0
+ */
+ __u8 lr[OPAL_MAX_LRS];
+ __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */
+ __u8 entire_table; /* Set all locking objects in SUM */
+ __u8 align[4]; /* Align to 8 byte boundary */
+};
+
struct opal_session_info {
__u32 sum;
__u32 who;
@@ -98,6 +111,18 @@ struct opal_lr_status {
__u8 align[4];
};
+struct opal_sum_ranges {
+ /*
+ * Initiate Admin1 session if key_len > 0,
+ * use Anybody session otherwise.
+ */
+ struct opal_key key;
+ __u8 num_lrs;
+ __u8 lr[OPAL_MAX_LRS];
+ __u8 range_policy;
+ __u8 align[5]; /* Align to 8 byte boundary */
+};
+
struct opal_lock_unlock {
struct opal_session_info session;
__u32 l_state;
@@ -216,5 +241,10 @@ struct opal_revert_lsp {
#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery)
#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp)
#define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw)
+#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react)
+#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup)
+#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup)
+#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges)
+#define IOC_OPAL_STACK_RESET _IO('p', 246)
#endif /* _UAPI_SED_OPAL_H */
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a88876756805..6991370a72ce 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -58,6 +58,45 @@
#define UBLK_U_CMD_TRY_STOP_DEV \
_IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
/*
+ * Register a shared memory buffer for zero-copy I/O.
+ * Input: ctrl_cmd.addr points to struct ublk_shmem_buf_reg (buffer VA + size)
+ * ctrl_cmd.len = sizeof(struct ublk_shmem_buf_reg)
+ * Result: >= 0 is the assigned buffer index, < 0 is error
+ *
+ * The kernel pins pages from the calling process's address space
+ * and inserts PFN ranges into a per-device maple tree. When a block
+ * request's pages match registered pages, the driver sets
+ * UBLK_IO_F_SHMEM_ZC and encodes the buffer index + offset in addr,
+ * allowing the server to access the data via its own mapping of the
+ * same shared memory — true zero copy.
+ *
+ * The memory can be backed by memfd, hugetlbfs, or any GUP-compatible
+ * shared mapping. Queue freeze is handled internally.
+ *
+ * The buffer VA and size are passed via a user buffer (not inline in
+ * ctrl_cmd) so that unprivileged devices can prepend the device path
+ * to ctrl_cmd.addr without corrupting the VA.
+ */
+#define UBLK_U_CMD_REG_BUF \
+ _IOWR('u', 0x18, struct ublksrv_ctrl_cmd)
+/*
+ * Unregister a shared memory buffer.
+ * Input: ctrl_cmd.data[0] = buffer index
+ */
+#define UBLK_U_CMD_UNREG_BUF \
+ _IOWR('u', 0x19, struct ublksrv_ctrl_cmd)
+
+/* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */
+struct ublk_shmem_buf_reg {
+ __u64 addr; /* userspace virtual address of shared memory */
+ __u64 len; /* buffer size in bytes, page-aligned, default max 4GB */
+ __u32 flags;
+ __u32 reserved;
+};
+
+/* Pin pages without FOLL_WRITE; usable with write-sealed memfd */
+#define UBLK_SHMEM_BUF_READ_ONLY (1U << 0)
+/*
* 64bits are enough now, and it should be easy to extend in case of
* running out of feature flags
*/
@@ -370,6 +409,14 @@
/* Disable automatic partition scanning when device is started */
#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
+/*
+ * Enable shared memory zero copy. When enabled, the server can register
+ * shared memory buffers via UBLK_U_CMD_REG_BUF. If a block request's
+ * pages match a registered buffer, UBLK_IO_F_SHMEM_ZC is set and addr
+ * encodes the buffer index + offset instead of a userspace buffer address.
+ */
+#define UBLK_F_SHMEM_ZC (1ULL << 19)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -469,6 +516,12 @@ struct ublksrv_ctrl_dev_info {
#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
/* Request has an integrity data buffer */
#define UBLK_IO_F_INTEGRITY (1UL << 18)
+/*
+ * I/O buffer is in a registered shared memory buffer. When set, the addr
+ * field in ublksrv_io_desc encodes buffer index and byte offset instead
+ * of a userspace virtual address.
+ */
+#define UBLK_IO_F_SHMEM_ZC (1U << 19)
/*
* io cmd is described by this structure, and stored in share memory, indexed
@@ -743,4 +796,31 @@ struct ublk_params {
struct ublk_param_integrity integrity;
};
+/*
+ * Shared memory zero-copy addr encoding for UBLK_IO_F_SHMEM_ZC.
+ *
+ * When UBLK_IO_F_SHMEM_ZC is set, ublksrv_io_desc.addr is encoded as:
+ * bits [0:31] = byte offset within the buffer (up to 4GB)
+ * bits [32:47] = buffer index (up to 65536)
+ * bits [48:63] = reserved (must be zero)
+ */
+#define UBLK_SHMEM_ZC_OFF_MASK 0xffffffffULL
+#define UBLK_SHMEM_ZC_IDX_OFF 32
+#define UBLK_SHMEM_ZC_IDX_MASK 0xffffULL
+
+static inline __u64 ublk_shmem_zc_addr(__u16 index, __u32 offset)
+{
+ return ((__u64)index << UBLK_SHMEM_ZC_IDX_OFF) | offset;
+}
+
+static inline __u16 ublk_shmem_zc_index(__u64 addr)
+{
+ return (addr >> UBLK_SHMEM_ZC_IDX_OFF) & UBLK_SHMEM_ZC_IDX_MASK;
+}
+
+static inline __u32 ublk_shmem_zc_offset(__u64 addr)
+{
+ return (__u32)(addr & UBLK_SHMEM_ZC_OFF_MASK);
+}
+
#endif