From 8e29da69feade64ec7fe9e1a2824b967c5183a21 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 18 Apr 2022 10:44:00 -0600
Subject: io_uring: add support for IORING_ASYNC_CANCEL_ALL

The current cancelation will lookup and cancel the first request it
finds based on the key passed in. Add a flag that allows canceling any
request that matches the key. It completes with the number of requests
found and canceled, or res < 0 if an error occurred.

Signed-off-by: Jens Axboe
Link: https://lore.kernel.org/r/20220418164402.75259-4-axboe@kernel.dk
---
 include/uapi/linux/io_uring.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1845cf7c80ba..476e58a2837f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -187,6 +187,13 @@ enum {
 #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
 
+/*
+ * ASYNC_CANCEL flags.
+ *
+ * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
+ */
+#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
--
cgit v1.2.3

From 4bf94615b8886305199ed5755cb72fea88258d15 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 18 Apr 2022 10:44:01 -0600
Subject: io_uring: allow IORING_OP_ASYNC_CANCEL with 'fd' key

Currently sqe->addr must contain the user_data of the request being
canceled. Introduce the IORING_ASYNC_CANCEL_FD flag, which tells the
kernel that we're keying off the file fd instead for cancelation. This
allows canceling any request that a) uses a file, and b) was assigned
the file based on the value being passed in.

Signed-off-by: Jens Axboe
Link: https://lore.kernel.org/r/20220418164402.75259-5-axboe@kernel.dk
---
 include/uapi/linux/io_uring.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 476e58a2837f..cc7fe82a1798 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -191,8 +191,11 @@ enum {
  * ASYNC_CANCEL flags.
  *
  * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
+ * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
+ *				request 'user_data'
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
+#define IORING_ASYNC_CANCEL_FD	(1U << 1)
 
 /*
  * IO completion data structure (Completion Queue Entry)
--
cgit v1.2.3

From 970f256edb8c1259c8ed48d52b38215135396126 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 18 Apr 2022 10:44:02 -0600
Subject: io_uring: add support for IORING_ASYNC_CANCEL_ANY

Rather than match on a specific key, be it user_data or file, allow
canceling any request that we can look up. Works like
IORING_ASYNC_CANCEL_ALL in that it cancels multiple requests, but it
doesn't key off user_data or the file. Can't be set with
IORING_ASYNC_CANCEL_FD, as that's a key selector. Only one may be used
at a time.

Signed-off-by: Jens Axboe
Link: https://lore.kernel.org/r/20220418164402.75259-6-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cc7fe82a1798..980d82eb196e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -193,9 +193,11 @@ enum {
  * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
  * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
  *				request 'user_data'
+ * IORING_ASYNC_CANCEL_ANY	Match any request
  */
 #define IORING_ASYNC_CANCEL_ALL	(1U << 0)
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
+#define IORING_ASYNC_CANCEL_ANY	(1U << 2)
 
 /*
  * IO completion data structure (Completion Queue Entry)
--
cgit v1.2.3
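A minimal usage sketch for the three cancel flags above (not part of the patches): liburing is assumed for ring setup and submission, it assumes no other completion is outstanding when it waits, and the key placement follows the commit messages — sqe->addr carries the user_data key, sqe->fd the file key when IORING_ASYNC_CANCEL_FD is set, and the flags go in sqe->cancel_flags.

#include <string.h>
#include <liburing.h>

/*
 * Cancel every pending request that matches the given key. With
 * IORING_ASYNC_CANCEL_ALL or IORING_ASYNC_CANCEL_ANY set, cqe->res is the
 * number of requests found and canceled; otherwise 0 on success, < 0 on error.
 */
static int cancel_matching(struct io_uring *ring, __u64 user_data,
			   int fd, unsigned flags)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -1;	/* submission queue full */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->cancel_flags = flags;
	if (flags & IORING_ASYNC_CANCEL_FD)
		sqe->fd = fd;		/* key on the file descriptor */
	else
		sqe->addr = user_data;	/* key on the original user_data */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

For example, cancel_matching(&ring, 0, sockfd, IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL) cancels every pending request keyed to sockfd, while IORING_ASYNC_CANCEL_ANY ignores the key entirely and, as noted above, cannot be combined with the FD selector.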
From e1169f06d5bbdbc2b22ae4e3083a4bf75ae5ecee Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 25 Apr 2022 19:49:03 -0600
Subject: io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used

If this is set, io_uring will never use an IPI to deliver a task_work
notification. This can be used in the common case where a single task or
thread communicates with the ring, and doesn't rely on
io_uring_peek_cqe(). This provides a noticeable win in performance, both
from eliminating the IPI itself and from avoiding interrupting the
submitting task unnecessarily.

Reviewed-by: Pavel Begunkov
Link: https://lore.kernel.org/r/20220426014904.60384-6-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 980d82eb196e..a84f29d657c3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -102,6 +102,14 @@ enum {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
 
 enum {
 	IORING_OP_NOP,
--
cgit v1.2.3

From ef060ea9e4fd3b763e7060a3af0a258d2d5d7c0d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 25 Apr 2022 19:49:04 -0600
Subject: io_uring: add IORING_SETUP_TASKRUN_FLAG

If IORING_SETUP_COOP_TASKRUN is set to use cooperative scheduling for
running task_work, then IORING_SETUP_TASKRUN_FLAG can be set so the
application can tell if task_work is pending in the kernel for this
ring. This allows use cases like io_uring_peek_cqe() to still function
appropriately, or for the task to know when it would be useful to call
io_uring_wait_cqe() to run pending events.

Reviewed-by: Pavel Begunkov
Link: https://lore.kernel.org/r/20220426014904.60384-7-axboe@kernel.dk
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a84f29d657c3..fad63564678a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -110,6 +110,12 @@ enum {
  * a task running in userspace, and saves an IPI.
  */
 #define IORING_SETUP_COOP_TASKRUN	(1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
 
 enum {
 	IORING_OP_NOP,
@@ -256,6 +262,7 @@ struct io_sqring_offsets {
  */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
 #define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
+#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */
 
 struct io_cqring_offsets {
 	__u32 head;
--
cgit v1.2.3
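A sketch of how a single-issuer application might opt into both flags (liburing is assumed to handle the ring mmap and setup; sq_flags is assumed to point at the SQ ring flags word described by io_sqring_offsets.flags, which is where IORING_SQ_TASKRUN is published):

#include <string.h>
#include <liburing.h>

/* No IPIs for task_work delivery; instead ask the kernel to raise
 * IORING_SQ_TASKRUN in the SQ ring flags when pending work needs a
 * kernel transition to run.
 */
static int setup_coop_ring(struct io_uring *ring, unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG;
	return io_uring_queue_init_params(entries, ring, &p);
}

/* Before spinning on the CQ ring, check whether the kernel flagged pending
 * task_work; if it did, any io_uring_enter() transition (a submit or a wait)
 * will run that work and publish the resulting completions.
 */
static int need_enter(const unsigned *sq_flags)
{
	return (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) &
		IORING_SQ_TASKRUN) != 0;
}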
From 0455d4ccec548b0fb51db39a4d3350a7a80a0222 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 26 Apr 2022 12:11:33 -0600
Subject: io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg

If IORING_RECVSEND_POLL_FIRST is set for recv/recvmsg or send/sendmsg,
then we arm poll first rather than attempt a receive or send upfront.
This can be useful if we expect there to be no data (or space) available
for the request, as we can then avoid wasting time on the initial issue
attempt.

Reviewed-by: Hao Xu
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index fad63564678a..06621a278cb6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -213,6 +213,16 @@ enum {
 #define IORING_ASYNC_CANCEL_FD	(1U << 1)
 #define IORING_ASYNC_CANCEL_ANY	(1U << 2)
 
+/*
+ * send/sendmsg and recv/recvmsg flags (sqe->addr2)
+ *
+ * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
+ *				or receive and arm poll if that yields an
+ *				-EAGAIN result, arm poll upfront and skip
+ *				the initial transfer attempt.
+ */
+#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
--
cgit v1.2.3
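A sketch of arming a recv this way (liburing is assumed only for the prep/submit plumbing; sockfd, buf, and the user_data tag are placeholders, and per the header comment above the flag is carried in sqe->addr2 at this point in the series):

#include <liburing.h>

/* Receive into 'buf', but skip the initial recv attempt and go straight to
 * poll: useful when the socket is expected to have no data most of the time.
 */
static int queue_recv_poll_first(struct io_uring *ring, int sockfd,
				 void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;	/* submission queue full */
	io_uring_prep_recv(sqe, sockfd, buf, len, 0);
	sqe->addr2 = IORING_RECVSEND_POLL_FIRST;
	sqe->user_data = 0xfeed;	/* arbitrary tag for the completion */
	return io_uring_submit(ring);
}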
From 1339f24b336db5ded9811f3fe7b948e0de207785 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 7 May 2022 14:18:44 -0600
Subject: io_uring: allow allocated fixed files for openat/openat2

If the application passes in IORING_FILE_INDEX_ALLOC as the file_slot,
then that's a hint to allocate a fixed file descriptor rather than have
one be passed in directly. This can be useful for having io_uring manage
the direct descriptor space.

Normal open direct requests will complete with 0 for success, and < 0
in case of error. If io_uring is asked to allocate the direct
descriptor, then the direct descriptor is returned in case of success.

Reviewed-by: Hao Xu
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 06621a278cb6..b7f02a55032a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -63,6 +63,15 @@ struct io_uring_sqe {
 	__u64	__pad2[2];
 };
 
+/*
+ * If sqe->file_index is set to this for opcodes that instantiate a new
+ * direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ * an available direct descriptor instead of having the application pass one
+ * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
+ * if the space is full.
+ */
+#define IORING_FILE_INDEX_ALLOC	(~0U)
+
 enum {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,
--
cgit v1.2.3

From a8da73a32b6e9271a613e5a0e90a8c35f40abeb8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 9 May 2022 09:29:14 -0600
Subject: io_uring: add flag for allocating a fully sparse direct descriptor space

Currently, to set up a fully sparse descriptor space upfront, the app
needs to allocate an array of the full size, memset it to -1, and then
pass that in. Make this a bit easier by allowing a flag that simply does
this internally rather than needing to copy each slot separately.

This works with IORING_REGISTER_FILES2 as the flag is set in struct
io_uring_rsrc_register, and is only allowed when the type is
IORING_RSRC_FILE as this doesn't make sense for registered buffers.

Reviewed-by: Hao Xu
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b7f02a55032a..36ec43dc7bf9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -396,9 +396,15 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+/*
+ * Register a fully sparse file space, rather than pass in an array of all
+ * -1 file descriptors.
+ */
+#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)
+
 struct io_uring_rsrc_register {
 	__u32 nr;
-	__u32 resv;
+	__u32 flags;
 	__u64 resv2;
 	__aligned_u64 data;
 	__aligned_u64 tags;
--
cgit v1.2.3
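A sketch tying the two patches above together, using the raw io_uring_register(2) syscall for the IORING_REGISTER_FILES2 call (its argument struct is defined in this header) and liburing for the rest; the table size, path, and open flags are whatever the application chooses:

#include <fcntl.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <liburing.h>

/* Register an all-sparse table of 'nr' direct descriptor slots; with the
 * SPARSE flag the kernel fills every slot with -1 internally, so no fd
 * array is passed (raw syscall return convention: -1 with errno on error).
 */
static int register_sparse_files(struct io_uring *ring, unsigned nr)
{
	struct io_uring_rsrc_register reg;

	memset(&reg, 0, sizeof(reg));
	reg.nr = nr;
	reg.flags = IORING_RSRC_REGISTER_SPARSE;
	return syscall(__NR_io_uring_register, ring->ring_fd,
		       IORING_REGISTER_FILES2, &reg, sizeof(reg));
}

/* Open a file straight into an io_uring-chosen slot: cqe->res carries the
 * allocated direct descriptor on success, or -ENFILE if the table is full.
 */
static void queue_open_alloc(struct io_uring *ring, const char *path)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return;
	io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
	sqe->file_index = IORING_FILE_INDEX_ALLOC;
}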
From 390ed29b5e425ba00da2b6113b74a14949f71b02 Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Sat, 14 May 2022 22:20:43 +0800
Subject: io_uring: add IORING_ACCEPT_MULTISHOT for accept

Add an accept flag, IORING_ACCEPT_MULTISHOT, to support multishot
accept.

Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20220514142046.58072-2-haoxu.linux@gmail.com
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 36ec43dc7bf9..15f821af9242 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -232,6 +232,11 @@ enum {
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 
+/*
+ * accept flags stored in sqe->ioprio
+ */
+#define IORING_ACCEPT_MULTISHOT	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
--
cgit v1.2.3
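A sketch of a multishot accept loop under this flag (liburing is assumed for the plumbing; handle_connection is a hypothetical application callback; per the header comment above the flag goes in sqe->ioprio, and each arriving connection produces its own CQE with the new descriptor in res):

#include <liburing.h>

void handle_connection(int fd);	/* hypothetical application callback */

/* One SQE accepts connections repeatedly; each CQE carries a new fd in res
 * and keeps IORING_CQE_F_MORE set while the multishot request stays armed.
 */
static void accept_loop(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	if (!sqe)
		return;
	io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
	sqe->ioprio |= IORING_ACCEPT_MULTISHOT;	/* accept flags, per above */
	io_uring_submit(ring);

	for (;;) {
		int res;

		if (io_uring_wait_cqe(ring, &cqe))
			break;
		res = cqe->res;
		io_uring_cqe_seen(ring, cqe);
		if (res < 0)
			break;		/* multishot terminated; re-arm if needed */
		handle_connection(res);
	}
}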
From c7fb19428d67dd0a2a78a4f237af01d39c78dc5a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 30 Apr 2022 14:38:53 -0600
Subject: io_uring: add support for ring mapped supplied buffers

Provided buffers allow an application to supply io_uring with buffers
that can then be grabbed for a read/receive request, when the data
source is ready to deliver data. The existing scheme relies on using
IORING_OP_PROVIDE_BUFFERS to do that, but it can be difficult to use in
real-world applications. It's pretty efficient if the application is
able to supply back batches of provided buffers when they have been
consumed and the application is ready to recycle them, but if
fragmentation occurs in the buffer space, it can become difficult to
supply enough buffers when they are needed. This hurts efficiency.

Add a register op, IORING_REGISTER_PBUF_RING, which allows an
application to set up a shared queue for each buffer group of provided
buffers. The application can then supply buffers simply by adding them
to this ring, and the kernel can consume them just as easily. The ring
tail is shared with the application, while the head remains private to
the kernel.

Provided buffers set up with IORING_REGISTER_PBUF_RING cannot use
IORING_OP_{PROVIDE,REMOVE}_BUFFERS for adding or removing entries; they
must use the mapped ring. Mapped provided buffer rings can co-exist with
normal provided buffers, just not within the same group ID.

To gauge the overhead of the existing scheme and evaluate the mapped
ring approach, a simple NOP benchmark was written. It uses a ring of 128
entries, and submits/completes 32 at a time. 'Replenish' is how many
buffers are provided back at a time after they have been consumed:

Test			Replenish	NOPs/sec
================================================================
No provided buffers	NA		~30M
Provided buffers	32		~16M
Provided buffers	1		~10M
Ring buffers		32		~27M
Ring buffers		1		~27M

The ring mapped buffers perform almost as well as not using provided
buffers at all, and they don't care whether you provide 1 or more back
at a time. This means applications can just replenish as they go, rather
than needing to batch and compact, further reducing overhead in the
application. The NOP benchmark above doesn't need to do any compaction,
so that overhead isn't even reflected in the above test.

Co-developed-by: Dylan Yudaken
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 15f821af9242..ddf969ae5a79 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -384,6 +384,10 @@ enum {
 	IORING_REGISTER_RING_FDS		= 20,
 	IORING_UNREGISTER_RING_FDS		= 21,
 
+	/* register ring based provide buffer group */
+	IORING_REGISTER_PBUF_RING		= 22,
+	IORING_UNREGISTER_PBUF_RING		= 23,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -461,6 +465,38 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };
 
+struct io_uring_buf {
+	__u64	addr;
+	__u32	len;
+	__u16	bid;
+	__u16	resv;
+};
+
+struct io_uring_buf_ring {
+	union {
+		/*
+		 * To avoid spilling into more pages than we need to, the
+		 * ring tail is overlaid with the io_uring_buf->resv field.
+		 */
+		struct {
+			__u64	resv1;
+			__u32	resv2;
+			__u16	resv3;
+			__u16	tail;
+		};
+		struct io_uring_buf	bufs[0];
+	};
+};
+
+/* argument for IORING_(UN)REGISTER_PBUF_RING */
+struct io_uring_buf_reg {
+	__u64	ring_addr;
+	__u32	ring_entries;
+	__u16	bgid;
+	__u16	pad;
+	__u64	resv[3];
+};
+
 /*
  * io_uring_restriction->opcode values
  */
--
cgit v1.2.3
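To make the flow concrete, here is a sketch (not part of the patch) of registering one such ring with the raw register syscall, publishing a buffer to it, and consuming buffers from it via a buffer-select recv. The 128-entry ring, group id 0, the release-store tail publication, and the nr_args value of 1 for the register call are assumptions of this sketch rather than something spelled out in the diff above; liburing is assumed only for basic ring plumbing.

#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <liburing.h>

#define BUF_ENTRIES	128	/* ring size; assumed to be a power of two */
#define BUF_SIZE	4096
#define BGID		0	/* buffer group id used below */

/* Allocate the app-owned ring memory and register it for group BGID. */
static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;

	if (posix_memalign((void **)&br, 4096,
			   BUF_ENTRIES * sizeof(struct io_uring_buf)))
		return NULL;
	memset(br, 0, BUF_ENTRIES * sizeof(struct io_uring_buf));

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long)br;
	reg.ring_entries = BUF_ENTRIES;
	reg.bgid = BGID;
	if (syscall(__NR_io_uring_register, ring->ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
		free(br);
		return NULL;
	}
	return br;
}

/* Hand one buffer to the kernel: fill the next slot, then publish the new
 * tail with a release store. Only addr/len/bid are written, since bufs[0]'s
 * resv field is overlaid by the shared tail.
 */
static void provide_buf(struct io_uring_buf_ring *br, unsigned short *tail,
			void *addr, unsigned len, unsigned short bid)
{
	struct io_uring_buf *buf = &br->bufs[*tail & (BUF_ENTRIES - 1)];

	buf->addr = (unsigned long)addr;
	buf->len = len;
	buf->bid = bid;
	(*tail)++;
	__atomic_store_n(&br->tail, *tail, __ATOMIC_RELEASE);
}

/* A recv that lets the kernel pick a buffer from group BGID; the chosen
 * buffer id comes back in cqe->flags >> IORING_CQE_BUFFER_SHIFT.
 */
static void queue_recv_provided(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return;
	io_uring_prep_recv(sqe, sockfd, NULL, BUF_SIZE, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
}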