From 6f02527729bd31ca4e473bff19fda4ccd5889148 Mon Sep 17 00:00:00 2001
From: Norman Maurer
Date: Mon, 28 Jul 2025 20:59:53 -1000
Subject: io_uring/net: Allow to do vectorized send

At the moment you have to use sendmsg for vectorized send. While this
works, it's suboptimal as it also means you need to allocate a struct
msghdr that needs to be kept alive until a submission happens. We can
remove this limitation by simply allowing send to be used directly.

Signed-off-by: Norman Maurer
Link: https://lore.kernel.org/r/20250729065952.26646-1-norman_maurer@apple.com
[axboe: remove -EINVAL return for SENDMSG and SEND_VECTORIZED]
[axboe: allow send_zc to set SEND_VECTORIZED too]
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b8a0e70ee2fd..6957dc539d83 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -392,12 +392,16 @@ enum io_uring_op {
  *				the starting buffer ID in cqe->flags as per
  *				usual for provided buffer usage. The buffers
  *				will be contiguous from the starting buffer ID.
+ *
+ * IORING_SEND_VECTORIZED	If set, SEND[_ZC] will take a pointer to a io_vec
+ *				to allow vectorized send operations.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
 #define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
 #define IORING_RECVSEND_BUNDLE		(1U << 4)
+#define IORING_SEND_VECTORIZED		(1U << 5)

 /*
  * cqe.res for IORING_CQE_F_NOTIF if
--
cgit v1.2.3
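As a usage illustration (not part of the patch), here is a minimal sketch of how userspace might issue a vectorized send with the new flag. It assumes a liburing build whose installed io_uring.h already defines IORING_SEND_VECTORIZED; the flag placement in sqe->ioprio follows the other send/recv flags, and the addr/len interpretation (iovec array plus entry count) is inferred from the commit rather than taken from a dedicated liburing helper:

#include <liburing.h>
#include <string.h>
#include <sys/uio.h>

/* Queue one vectorized send; iov[] must stay valid until submission. */
static int queue_vectorized_send(struct io_uring *ring, int sockfd)
{
	static char hdr[]  = "hello ";
	static char body[] = "world\n";
	static struct iovec iov[2];
	struct io_uring_sqe *sqe;

	iov[0].iov_base = hdr;
	iov[0].iov_len  = strlen(hdr);
	iov[1].iov_base = body;
	iov[1].iov_len  = strlen(body);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -1;

	/* addr = iovec array, len = number of entries (inferred behaviour) */
	io_uring_prep_send(sqe, sockfd, iov, 2, 0);
	/* send/recv flags live in sqe->ioprio, like IORING_RECVSEND_* */
	sqe->ioprio |= IORING_SEND_VECTORIZED;

	return io_uring_submit(ring);
}

Compared with the sendmsg path, nothing beyond the iovec array itself has to be kept alive across submission, which is the point of the change.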
From 620a50c927004f5c9420a7ca9b1a55673dbf3941 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 21 Aug 2025 12:02:07 +0800
Subject: io_uring: uring_cmd: add multishot support

Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot
uring_cmd operations with provided buffer.

This enables drivers to post multiple completion events from a single
uring_cmd submission, which is useful for:

- Notifying userspace of device events (e.g., interrupt handling)
- Supporting devices with multiple event sources (e.g., multi-queue
  devices)
- Avoiding the need for device poll() support when events originate
  from multiple sources device-wide

The implementation adds two new APIs:

- io_uring_cmd_select_buffer(): selects a buffer from the provided
  buffer group for multishot uring_cmd
- io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is pushed
  to the provided buffer

Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT)
and is mutually exclusive with IORING_URING_CMD_FIXED for now.

The ublk driver will be the first user of this functionality:

https://github.com/ming1/linux/commits/ublk-devel/

Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20250821040210.1152145-3-ming.lei@redhat.com
[axboe: fold in fix for !CONFIG_IO_URING]
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6957dc539d83..1e935f8901c5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -298,9 +298,13 @@ enum io_uring_op {
  * sqe->uring_cmd_flags		top 8bits aren't available for userspace
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
+ * IORING_URING_CMD_MULTISHOT	must be used with buffer select, like other
+ *				multishot commands. Not compatible with
+ *				IORING_URING_CMD_FIXED, for now.
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
+#define IORING_URING_CMD_MULTISHOT	(1U << 1)
+#define IORING_URING_CMD_MASK	(IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT)


 /*
--
cgit v1.2.3
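From userspace, a multishot uring_cmd looks like a normal IORING_OP_URING_CMD submission with buffer select enabled. A rough sketch follows; DRIVER_CMD_WAIT_EVENT, dev_fd and the buffer group ID are placeholders for whatever a real driver such as ublk would define, and the raw SQE fields are filled by hand since no dedicated liburing helper is assumed here:

#include <liburing.h>
#include <string.h>

#define EVENT_BGID		7	/* provided buffer group registered earlier */
#define DRIVER_CMD_WAIT_EVENT	0x1234	/* hypothetical driver-private cmd_op */

static int queue_multishot_cmd(struct io_uring *ring, int dev_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = dev_fd;
	sqe->cmd_op = DRIVER_CMD_WAIT_EVENT;
	/* multishot uring_cmd must take its data buffers from a provided
	 * buffer group via buffer select ... */
	sqe->flags = IOSQE_BUFFER_SELECT;
	sqe->buf_group = EVENT_BGID;
	/* ... and may not be combined with IORING_URING_CMD_FIXED */
	sqe->uring_cmd_flags = IORING_URING_CMD_MULTISHOT;

	return io_uring_submit(ring);
}

Each event the driver pushes via io_uring_mshot_cmd_post_cqe() should then surface as a CQE carrying IORING_CQE_F_BUFFER (and IORING_CQE_F_MORE while the command stays armed), with the selected buffer ID in the upper bits of cqe->flags, as is usual for provided buffers.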
From b69458735d826f0676585623d028a0fd474f3e4f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 7 Aug 2025 14:08:14 -0600
Subject: io_uring: add UAPI definitions for mixed CQE postings

This adds the CQE flags related to supporting a mixed CQ ring mode,
where both normal (16b) and big (32b) CQEs may be posted.

No functional changes in this patch.

Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1e935f8901c5..7af8d10b3aba 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -491,12 +491,22 @@ struct io_uring_cqe {
  *			other provided buffer type, all completions with a
  *			buffer passed back is automatically returned to the
  *			application.
+ * IORING_CQE_F_SKIP	If set, then the application/liburing must ignore this
+ *			CQE. It's only purpose is to fill a gap in the ring,
+ *			if a large CQE is attempted posted when the ring has
+ *			just a single small CQE worth of space left before
+ *			wrapping.
+ * IORING_CQE_F_32	If set, this is a 32b/big-cqe posting. Use with rings
+ *			setup in a mixed CQE mode, where both 16b and 32b
+ *			CQEs may be posted to the CQ ring.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
 #define IORING_CQE_F_BUF_MORE		(1U << 4)
+#define IORING_CQE_F_SKIP		(1U << 5)
+#define IORING_CQE_F_32			(1U << 15)

 #define IORING_CQE_BUFFER_SHIFT		16

--
cgit v1.2.3

From e26dca67fde194340582cfbb0c0bf661825e9e46 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 7 Aug 2025 14:14:41 -0600
Subject: io_uring: add support for IORING_SETUP_CQE_MIXED

Normal rings support 16b CQEs for posting completions, while certain
features require the ring to be configured with IORING_SETUP_CQE32, as
they need to convey more information per completion. This, in turn,
makes ALL the CQEs be 32b in size. This is somewhat wasteful and
inefficient, particularly when only certain CQEs need to be of the
bigger variant.

This adds support for setting up a ring with mixed CQE sizes, using
IORING_SETUP_CQE_MIXED. When set up in this mode, CQEs posted to the
ring may be either 16b or 32b in size. If a CQE is 32b in size, then
IORING_CQE_F_32 is set in the CQE flags to indicate that this is the
case. If this flag isn't set, the CQE is the normal 16b variant.

CQEs on these types of mixed rings may also have IORING_CQE_F_SKIP set.
This can happen if the ring is one (small) CQE entry away from
wrapping, and an attempt is made to post a 32b CQE. As CQEs must be
contiguous in the CQ ring, a 32b CQE cannot wrap the ring. For this
case, a single dummy CQE is posted with the SKIP flag set. The
application should simply ignore those.

Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7af8d10b3aba..5135e1be0390 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)

+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED		(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
--
cgit v1.2.3

From 806ecb209aa86fcc1d92bc9f10323cf773f64d6d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 7 Aug 2025 14:22:16 -0600
Subject: io_uring/nop: add support for IORING_SETUP_CQE_MIXED

This adds support for setting IORING_NOP_CQE32 as a flag for a NOP
command, in which case a 32b CQE will be posted rather than a regular
one. This is the default if the ring has been set up with
IORING_SETUP_CQE32.

If the ring has been set up with IORING_SETUP_CQE_MIXED, then 16b CQEs
will be posted without this flag set, and 32b CQEs if this flag is set.
For the latter case, sqe->off is what will be posted as cqe->big_cqe[0]
and sqe->addr is what will be posted as cqe->big_cqe[1].

Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5135e1be0390..04ebff33d0e6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -464,6 +464,7 @@ enum io_uring_msg_ring_flags {
 #define IORING_NOP_FIXED_FILE	(1U << 2)
 #define IORING_NOP_FIXED_BUFFER	(1U << 3)
 #define IORING_NOP_TW		(1U << 4)
+#define IORING_NOP_CQE32	(1U << 5)

 /*
  * IO completion data structure (Completion Queue Entry)
--
cgit v1.2.3
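Taken together, the three patches above let userspace opt into mixed CQEs and generate a 32b completion on demand for testing. The sketch below (editorial, not from the series) sets up such a ring, posts a NOP that asks for a big CQE, and reaps the CQ ring by hand. It assumes that head/tail advance in 16b units so a big CQE spans two slots, that liburing simply passes the new setup flag through, and that handle_cqe16()/handle_cqe32() are placeholders:

#include <liburing.h>

static void handle_cqe16(struct io_uring_cqe *cqe) { (void)cqe; /* placeholder */ }
static void handle_cqe32(struct io_uring_cqe *cqe) { (void)cqe; /* placeholder */ }

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	unsigned head, tail, mask;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_CQE_MIXED))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->nop_flags = IORING_NOP_CQE32;	/* ask for a 32b CQE */
	sqe->off  = 0x1122334455667788ULL;	/* ends up in cqe->big_cqe[0] */
	sqe->addr = 0x99aabbccddeeff00ULL;	/* ends up in cqe->big_cqe[1] */
	io_uring_submit_and_wait(&ring, 1);

	/* walk the CQ ring manually; stock liburing helpers assume one CQE size */
	head = *ring.cq.khead;
	tail = io_uring_smp_load_acquire(ring.cq.ktail);
	mask = ring.cq.ring_mask;
	while (head != tail) {
		struct io_uring_cqe *cqe = &ring.cq.cqes[head & mask];

		if (cqe->flags & IORING_CQE_F_SKIP) {
			head++;			/* filler before a wrap, ignore */
		} else if (cqe->flags & IORING_CQE_F_32) {
			handle_cqe32(cqe);	/* payload in cqe->big_cqe[0..1] */
			head += 2;		/* assumed: big CQE spans two slots */
		} else {
			handle_cqe16(cqe);
			head++;
		}
	}
	io_uring_smp_store_release(ring.cq.khead, head);

	io_uring_queue_exit(&ring);
	return 0;
}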
From c265ae75f900cea4e415230a77b5d152377627dd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 8 Sep 2025 00:03:00 +0100
Subject: io_uring: introduce io_uring querying

There are many parameters users might want to query about io_uring,
like available request types or the ring sizes. This patch introduces
an interface for such slow path queries. It was written with several
requirements in mind:

- Can be used with or without an io_uring instance. Asking for
  supported setup flags before creating an instance as well as querying
  info about an already created ring are valid use cases.
- Should be moderately fast. For example, users might use it to
  periodically retrieve ring attributes at runtime. As a consequence,
  it should be able to query multiple attributes in a single syscall.
- Backward and forward compatible.
- Should be reasonably easy to use.
- Reduce the kernel code size for introducing new query types.

It's implemented as a new registration opcode IORING_REGISTER_QUERY.
The user passes one or more query structures linked together, each
represented by struct io_uring_query_hdr. The header stores common
control fields needed for processing and points to query type specific
information. The header contains:

- The query type
- The result field, which on return contains the error code for the
  query
- A pointer to the query type specific information
- The size of the query structure. The kernel will only populate up to
  this size, which helps with backward compatibility. The kernel can
  also reduce the size, so if the current kernel is older than the
  interface the user tries to use, it'll get only the supported bits.
- A next_entry field, which is used to chain multiple queries.

Apart from common registration syscall failures, it can only
immediately return an error code when the headers are malformed or any
of the passed addresses are invalid. That usually means that userspace
doesn't use the API correctly and should be fixed. All query type
specific errors are returned in the header's result field.

As an example, the patch adds a single query type for now, i.e.
IO_URING_QUERY_OPCODES, which tells what register / request / etc.
opcodes are supported, but there are plans to extend it.

Note: there is a request probing interface via IORING_REGISTER_PROBE,
but it's a mess. It requires the user to create a ring first, it only
works for requests, and requires dynamic allocations.

Reviewed-by: Martin K. Petersen
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 04ebff33d0e6..1ce17c535944 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -686,6 +686,9 @@ enum io_uring_register_op {

 	IORING_REGISTER_MEM_REGION		= 34,

+	/* query various aspects of io_uring, see linux/io_uring/query.h */
+	IORING_REGISTER_QUERY			= 35,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,

--
cgit v1.2.3
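A sketch of what a single-query call might look like from userspace. The header and payload field names below are paraphrased from the description above rather than copied from the real linux/io_uring/query.h, and the idea that a ring-less query passes -1 as the fd, with nr_args giving the number of chained headers, is likewise an assumption:

#include <linux/io_uring.h>
#include <linux/io_uring/query.h>	/* authoritative layout lives here */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* payload for IO_URING_QUERY_OPCODES; field names are illustrative */
	struct io_uring_query_opcode op = { 0 };
	struct io_uring_query_hdr hdr = { 0 };

	hdr.query_op = IO_URING_QUERY_OPCODES;		/* the query type */
	hdr.query_data = (uint64_t)(uintptr_t)&op;	/* type specific info */
	hdr.size = sizeof(op);		/* kernel fills in at most this much */
	hdr.next_entry = 0;		/* no further queries chained */

	/* no ring needed: -1 as the fd is assumed to mean "no instance" */
	if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY,
		    &hdr, 1) < 0) {
		perror("io_uring_register(IORING_REGISTER_QUERY)");
		return 1;
	}
	if ((int)hdr.result < 0) {
		fprintf(stderr, "query failed: %d\n", (int)hdr.result);
		return 1;
	}
	printf("request opcodes supported: %u\n", op.nr_request_opcodes);
	return 0;
}

Several headers can be chained through next_entry so that multiple attributes come back from a single syscall, which is what the "moderately fast" requirement above is about.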
From 705d2ac7b2044f1ca05ba6033183151a04dbff4d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 16 Sep 2025 15:28:02 +0100
Subject: io_uring/zcrx: allow synchronous buffer return

Returning buffers via a ring is performant and convenient, but it
becomes a problem when/if the user misconfigures the ring size and it
becomes full. Add a synchronous way to return buffers back to the page
pool via a new register opcode. It's supposed to be a reliable slow
path for refilling.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ce17c535944..a0cc1cc0dd01 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -689,6 +689,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY			= 35,

+	/* return zcrx buffers back into circulation */
+	IORING_REGISTER_ZCRX_REFILL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,

@@ -1070,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };

+struct io_uring_zcrx_sync_refill {
+	__u32	zcrx_id;
+	/* the number of entries to return */
+	__u32	nr_entries;
+	/* pointer to an array of struct io_uring_zcrx_rqe */
+	__u64	rqes;
+	__u64	__resv[2];
+};
+
 #ifdef __cplusplus
 }
 #endif
--
cgit v1.2.3

From beb97995b97532e1f215e3295e6843e59862f94b Mon Sep 17 00:00:00 2001
From: Haiyue Wang
Date: Tue, 7 Oct 2025 14:18:18 +0800
Subject: io_uring: use tab indentation for IORING_SEND_VECTORIZED comment

Be consistent with the tab style of
"liburing/src/include/liburing/io_uring.h".

Signed-off-by: Haiyue Wang
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a0cc1cc0dd01..263bed13473e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -404,7 +404,7 @@ enum io_uring_op {
  *				will be contiguous from the starting buffer ID.
  *
  * IORING_SEND_VECTORIZED	If set, SEND[_ZC] will take a pointer to a io_vec
- *                              to allow vectorized send operations.
+ *				to allow vectorized send operations.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
--
cgit v1.2.3

From 819630bd6f86ac8998c7df9deddb6cee50e9e22d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 3 Nov 2025 13:53:13 +0000
Subject: io_uring/zcrx: remove sync refill uapi

There is a better way to handle the problem IORING_REGISTER_ZCRX_REFILL
solves. The uapi can also be slightly adjusted to accommodate future
extensions. Remove the feature for now; it'll be reworked for the next
release.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/uapi/linux/io_uring.h')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..b7c8dad26690 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -689,9 +689,6 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY			= 35,

-	/* return zcrx buffers back into circulation */
-	IORING_REGISTER_ZCRX_REFILL		= 36,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,

@@ -1073,15 +1070,6 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };

-struct io_uring_zcrx_sync_refill {
-	__u32	zcrx_id;
-	/* the number of entries to return */
-	__u32	nr_entries;
-	/* pointer to an array of struct io_uring_zcrx_rqe */
-	__u64	rqes;
-	__u64	__resv[2];
-};
-
 #ifdef __cplusplus
 }
 #endif
--
cgit v1.2.3