From 8e9fad0e70b7b62848e0aeb1a873903b9ce4d7c4 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 27 Jun 2023 06:44:24 -0700 Subject: io_uring: Add io_uring command support for sockets Enable io_uring commands on network sockets. Create two new SOCKET_URING_OP commands that will operate on sockets. In order to call ioctl on sockets, use the file_operations->io_uring_cmd callbacks, and map it to a uring socket function, which handles the SOCKET_URING_OP accordingly, and calls socket ioctls. This patches was tested by creating a new test case in liburing. Link: https://github.com/leitao/liburing/tree/io_uring_cmd Signed-off-by: Breno Leitao Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230627134424.2784797-1-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index bb9c666bd584..106cdc55ff3b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -81,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -116,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode) { return ""; } +static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif #endif -- cgit v1.2.3 From b24c5d752962fa0970cd7e3d74b1cd0e843358de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:25 +0100 Subject: io_uring: simplify big_cqe handling Don't keep big_cqe bits of req in a union with hash_node, find a separate space for it. It's bit safer, but also if we keep it always initialised, we can get rid of ugly REQ_F_CQE32_INIT handling. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/447aa1b2968978c99e655ba88db536e903df0fe9.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f04ce513fadb..9795eda529f7 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -409,7 +409,6 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, REQ_F_HASH_LOCKED_BIT, @@ -479,8 +478,6 @@ enum { REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ @@ -579,13 +576,7 @@ struct io_kiocb { struct io_task_work io_task_work; unsigned nr_tw; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; + struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ @@ -595,6 +586,11 @@ struct io_kiocb { /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; + + struct { + u64 extra1; + u64 extra2; + } big_cqe; }; struct io_overflow_cqe { -- cgit v1.2.3 From ec26c225f06f5993f8891fa6c79fab3c92981181 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:29 +0100 Subject: io_uring: merge iopoll and normal completion paths io_do_iopoll() and io_submit_flush_completions() are pretty similar, both filling CQEs and then free a list of requests. Don't duplicate it and make iopoll use __io_submit_flush_completions(), which also helps with inlining and other optimisations. For that, we need to first find all completed iopoll requests and splice them from the iopoll list and then pass it down. This adds one extra list traversal, which should be fine as requests will stay hot in cache. CQ locking is already conditional, introduce ->lockless_cq and skip locking for IOPOLL as it's protected by ->uring_lock. We also add a wakeup optimisation for IOPOLL to __io_cq_unlock_post(), so it works just like io_cqring_ev_posted_iopoll(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3840473f5e8a960de35b77292026691880f6bdbc.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9795eda529f7..c0c03d8059df 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -205,6 +205,7 @@ struct io_ring_ctx { unsigned int has_evfd: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int lockless_cq: 1; unsigned int syscall_iopoll: 1; unsigned int poll_activated: 1; unsigned int drain_disabled: 1; -- cgit v1.2.3 From e5598d6ae62626d261b046a2f19347c38681ff51 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:31 +0100 Subject: io_uring: compact SQ/CQ heads/tails Queues heads and tails cache line aligned. That makes sq, cq taking 4 lines or 5 lines if we include the rest of struct io_rings (e.g. sq_flags is frequently accessed). Since modern io_uring is mostly single threaded, it doesn't make much send to spread them as such, it wastes space and puts additional pressure on caches. Put them all into a single line. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9c8deddf9a7ed32069235a530d1e117fb460bc4c.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c0c03d8059df..608a8e80e881 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -69,8 +69,8 @@ struct io_uring_task { }; struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; + u32 head; + u32 tail; }; /* -- cgit v1.2.3 From d7f06fea5d6be78403d42c9637f67bc883870094 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:33 +0100 Subject: io_uring: move non aligned field to the end Move not cache aligned fields down in io_ring_ctx, should change anything, but makes further refactoring easier. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/518e95d7888e9d481b2c5968dcf3f23db9ea47a5.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 608a8e80e881..ad87d6074fb2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -270,24 +270,6 @@ struct io_ring_ctx { struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - unsigned int file_alloc_start; - unsigned int file_alloc_end; - - struct xarray personalities; - u32 pers_next; - struct { /* * We cache a range of free CQEs we can use, once exhausted it @@ -332,6 +314,24 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned long check_cq; + + unsigned int file_alloc_start; + unsigned int file_alloc_end; + + struct xarray personalities; + u32 pers_next; + /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; -- cgit v1.2.3 From 18df385f42f0b3310ed2e4a3e39264bf5e784692 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:34 +0100 Subject: io_uring: banish non-hot data to end of io_ring_ctx Let's move all slow path, setup/init and so on fields to the end of io_ring_ctx, that makes ctx reorganisation later easier. That includes, page arrays used only on tear down, CQ overflow list, old provided buffer caches and used by io-wq poll hashes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fc471b63925a0bf90a34943c4d36163c523cfb43.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ad87d6074fb2..72e609752323 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,20 +211,11 @@ struct io_ring_ctx { unsigned int drain_disabled: 1; unsigned int compat: 1; - enum task_work_notify_mode notify_method; + struct task_struct *submitter_task; + struct io_rings *rings; + struct percpu_ref refs; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. - */ - unsigned short n_ring_pages; - unsigned short n_sqe_pages; - struct page **ring_pages; - struct page **sqe_pages; - - struct io_rings *rings; - struct task_struct *submitter_task; - struct percpu_ref refs; + enum task_work_notify_mode notify_method; } ____cacheline_aligned_in_smp; /* submission data */ @@ -262,10 +253,8 @@ struct io_ring_ctx { struct io_buffer_list *io_bl; struct xarray io_bl_xa; - struct list_head io_buffers_cache; struct io_hash_table cancel_table_locked; - struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; @@ -298,11 +287,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct io_hash_table cancel_table; struct llist_head work_llist; - - struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -318,6 +304,10 @@ struct io_ring_ctx { struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + struct list_head io_buffers_comp; + struct list_head cq_overflow_list; + struct io_hash_table cancel_table; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ @@ -332,6 +322,8 @@ struct io_ring_ctx { struct xarray personalities; u32 pers_next; + struct list_head io_buffers_cache; + /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; @@ -375,6 +367,15 @@ struct io_ring_ctx { unsigned sq_thread_idle; /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; }; struct io_tw_state { -- cgit v1.2.3 From c9def23dde5238184777340ad811e4903f216a2d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:35 +0100 Subject: io_uring: separate task_work/waiting cache line task_work's are typically queued up from IRQ/softirq potentially by a random CPU like in case of networking. Batch ctx fields bouncing as this into a separate cache line. We also move ->cq_timeouts there because waiters have to read and check it. We can also conditionally hide ->cq_timeouts in the future from the CQ wait path as a not really useful rudiment. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7f3fcb5b6b9cca0238778262c1fdb7ada6286b7.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 72e609752323..5de5dffe29df 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -270,15 +270,25 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; unsigned cq_extra; } ____cacheline_aligned_in_smp; + /* + * task_work and async notification delivery cacheline. Expected to + * regularly bounce b/w CPUs. + */ + struct { + struct llist_head work_llist; + unsigned long check_cq; + atomic_t cq_wait_nr; + atomic_t cq_timeouts; + struct wait_queue_head cq_wait; + } ____cacheline_aligned_in_smp; + struct { spinlock_t completion_lock; bool poll_multi_queue; - atomic_t cq_wait_nr; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -287,14 +297,11 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - - struct llist_head work_llist; } ____cacheline_aligned_in_smp; /* timeouts */ struct { spinlock_t timeout_lock; - atomic_t cq_timeouts; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; @@ -314,8 +321,6 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq; - unsigned int file_alloc_start; unsigned int file_alloc_end; -- cgit v1.2.3 From 0aa7aa5f766933d4f91b22d9658cd688e1f15dab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:36 +0100 Subject: io_uring: move multishot cqe cache in ctx We cache multishot CQEs before flushing them to the CQ in submit_state.cqe. It's a 16 entry cache totalling 256 bytes in the middle of the io_submit_state structure. Move it out of there, it should help with CPU caches for the submission state, and shouldn't affect cached CQEs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dbe1f39c043ee23da918836be44fcec252ce6711.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5de5dffe29df..01bdbc223edd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -176,7 +176,6 @@ struct io_submit_state { unsigned short submit_nr; unsigned int cqes_count; struct blk_plug plug; - struct io_uring_cqe cqes[16]; }; struct io_ev_fd { @@ -307,6 +306,8 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; + struct io_uring_cqe completion_cqes[16]; + /* IRQ completion list, under ->completion_lock */ struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; -- cgit v1.2.3 From 644c4a7a721fb90356cdd42219c9928a3c386230 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Aug 2023 23:53:37 +0100 Subject: io_uring: move iopoll ctx fields around Move poll_multi_queue and iopoll_list to the submission cache line, it doesn't make much sense to keep them separately, and is better place for it in general. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5b03cf7e6652e350e6e70a917eec72ba9f33b97b.1692916914.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 01bdbc223edd..13d19b9be9f4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -256,6 +256,15 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct io_wq_work_list iopoll_list; + bool poll_multi_queue; } ____cacheline_aligned_in_smp; struct { @@ -284,20 +293,6 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; } ____cacheline_aligned_in_smp; - struct { - spinlock_t completion_lock; - - bool poll_multi_queue; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - } ____cacheline_aligned_in_smp; - /* timeouts */ struct { spinlock_t timeout_lock; @@ -308,6 +303,8 @@ struct io_ring_ctx { struct io_uring_cqe completion_cqes[16]; + spinlock_t completion_lock; + /* IRQ completion list, under ->completion_lock */ struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; -- cgit v1.2.3