From 3c7d76d6128a0fef68e6540754bf85a44a29bb59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2025 03:25:41 -0700 Subject: io_uring: IOPOLL polling improvements io_uring manages issued and pending IOPOLL read/write requests in a singly linked list. One downside of that is that individual items cannot easily be removed from that list, and as a result, io_uring will only complete a completed request N in that list if 0..N-1 are also complete. For homogenous IO this isn't necessarily an issue, but if different devices are involved in polling in the same ring, or if disparate IO from the same device is being polled for, this can defer completion of some requests unnecessarily. Move to a doubly linked list for iopoll completions instead, making it possible to easily complete whatever requests that were polled done successfully. Co-developed-by: Fengnan Chang Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1adb0d20a0a..54fd30abf2b8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -316,7 +316,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ bool poll_multi_queue; - struct io_wq_work_list iopoll_list; + struct list_head iopoll_list; struct io_file_table file_table; struct io_rsrc_data buf_table; @@ -708,7 +708,16 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - struct io_task_work io_task_work; + + /* + * IOPOLL doesn't use task_work, so use the ->iopoll_node list + * entry to manage pending iopoll requests. + */ + union { + struct io_task_work io_task_work; + struct list_head iopoll_node; + }; + union { /* * for polled requests, i.e. IORING_OP_POLL_ADD and async armed -- cgit v1.2.3 From d6406c45f14842019cfaaba19fe2a76ef9fa831c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:45 -0700 Subject: io_uring: track restrictions separately for IORING_OP and IORING_REGISTER It's quite likely that only register opcode restrictions exists, in which case we'd never need to check the normal opcodes. Split ctx->restricted into two separate fields, one for I/O opcodes, and one for register opcodes. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 54fd30abf2b8..e4c804f99c30 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -224,7 +224,10 @@ struct io_restriction { DECLARE_BITMAP(sqe_op, IORING_OP_LAST); u8 sqe_flags_allowed; u8 sqe_flags_required; - bool registered; + /* IORING_OP_* restrictions exist */ + bool op_registered; + /* IORING_REGISTER_* restrictions exist */ + bool reg_registered; }; struct io_submit_link { @@ -259,7 +262,8 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int drain_next: 1; - unsigned int restricted: 1; + unsigned int op_restricted: 1; + unsigned int reg_restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int has_evfd: 1; -- cgit v1.2.3 From 697a5284ad9697609324739e38e341612cd342a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Jan 2026 07:59:08 -0700 Subject: io_uring: fix IOPOLL with passthrough I/O A previous commit improving IOPOLL made an incorrect assumption that task_work isn't used with IOPOLL. This can cause crashes when doing passthrough I/O on nvme, where queueing the completion task_work will trample on the same memory that holds the completed list of requests. Fix it up by shuffling the members around, so we're not sharing any parts that end up getting used in this path. Fixes: 3c7d76d6128a ("io_uring: IOPOLL polling improvements") Reported-by: Yi Zhang Link: https://lore.kernel.org/linux-block/CAHj4cs_SLPj9v9w5MgfzHKy+983enPx3ZQY2kMuMJ1202DBefw@mail.gmail.com/ Tested-by: Yi Zhang Cc: Ming Lei Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e4c804f99c30..211686ad89fd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -713,13 +713,10 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - /* - * IOPOLL doesn't use task_work, so use the ->iopoll_node list - * entry to manage pending iopoll requests. - */ union { struct io_task_work io_task_work; - struct list_head iopoll_node; + /* For IOPOLL setup queues, with hybrid polling */ + u64 iopoll_start; }; union { @@ -728,8 +725,8 @@ struct io_kiocb { * poll */ struct hlist_node hash_node; - /* For IOPOLL setup queues, with hybrid polling */ - u64 iopoll_start; + /* IOPOLL completion handling */ + struct list_head iopoll_node; /* for private io_kiocb freeing */ struct rcu_head rcu_head; }; -- cgit v1.2.3 From 07f3c3a1cd56c2048a92dad0c11f15e4ac3888c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 11:21:32 -0700 Subject: io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member A previous commit got rid of any use of this member, but forgot to remove it. Kill it. Fixes: f4bb2f65bb81 ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 211686ad89fd..dc6bd6940a0d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -442,6 +442,9 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; + /* protected by ->completion_lock */ + unsigned nr_req_allocated; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ @@ -454,10 +457,6 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif - /* protected by ->completion_lock */ - unsigned evfd_last_cq_tail; - unsigned nr_req_allocated; - /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from -- cgit v1.2.3