From 9f0deaa12d832f488500a5afe9b912e9b3cfc432 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 16 Aug 2022 06:59:59 -0700 Subject: eventfd: guard wake_up in eventfd fs calls as well Guard wakeups that the user can trigger, and that may end up triggering a call back into eventfd_signal. This is in addition to the current approach that only guards in eventfd_signal. Rename in_eventfd_signal -> in_eventfd at the same time to reflect this. Without this there would be a deadlock in the following code using libaio: int main() { struct io_context *ctx = NULL; struct iocb iocb; struct iocb *iocbs[] = { &iocb }; int evfd; uint64_t val = 1; evfd = eventfd(0, EFD_CLOEXEC); assert(!io_setup(2, &ctx)); io_prep_poll(&iocb, evfd, POLLIN); io_set_eventfd(&iocb, evfd); assert(1 == io_submit(ctx, 1, iocbs)); write(evfd, &val, 8); } Signed-off-by: Dylan Yudaken Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20220816135959.1490641-1-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/eventfd.h | 2 +- include/linux/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 305d5f19093b..30eb30d6909b 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -46,7 +46,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); static inline bool eventfd_signal_allowed(void) { - return !current->in_eventfd_signal; + return !current->in_eventfd; } #else /* CONFIG_EVENTFD */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..8d82d6d32670 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -936,7 +936,7 @@ struct task_struct { #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ - unsigned in_eventfd_signal:1; + unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; -- cgit v1.2.3 From c0e0d6ba25f180ab76d3c18f8b360a119dffa634 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:10 -0700 Subject: io_uring: add IORING_SETUP_DEFER_TASKRUN Allow deferring async tasks until the user calls io_uring_enter(2) with the IORING_ENTER_GETEVENTS flag. Enable this mode with a flag at io_uring_setup time. This functionality requires that the later io_uring_enter will be called from the same submission task, and therefore restrict this flag to work only when IORING_SETUP_SINGLE_ISSUER is also set. Being able to hand pick when tasks are run prevents the problem where there is current work to be done, however task work runs anyway. For example, a common workload would obtain a batch of CQEs, and process each one. Interrupting this to additional taskwork would add latency but not gain anything. If instead task work is deferred to just before more CQEs are obtained then no additional latency is added. The way this is implemented is by trying to keep task work local to a io_ring_ctx, rather than to the submission task. This is required, as the application will want to wake up only a single io_ring_ctx at a time to process work, and so the lists of work have to be kept separate. This has some other benefits like not having to check the task continually in handle_tw_list (and potentially unlocking/locking those), and reducing locks in the submit & process completions path. There are networking cases where using this option can reduce request latency by 50%. For example a contrived example using [1] where the client sends 2k data and receives the same data back while doing some system calls (to trigger task work) shows this reduction. The reason ends up being that if sending responses is delayed by processing task work, then the client side sits idle. Whereas reordering the sends first means that the client runs it's workload in parallel with the local task work. [1]: Using https://github.com/DylanZA/netbench/tree/defer_run Client: ./netbench --client_only 1 --control_port 10000 --host --tx "epoll --threads 16 --per_thread 1 --size 2048 --resp 2048 --workload 1000" Server: ./netbench --server_only 1 --control_port 10000 --rx "io_uring --defer_taskrun 0 --workload 100" --rx "io_uring --defer_taskrun 1 --workload 100" Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-5-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ include/uapi/linux/io_uring.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 677a25d44d7f..d56ff2185168 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -301,6 +301,8 @@ struct io_ring_ctx { struct io_hash_table cancel_table; bool poll_multi_queue; + struct llist_head work_llist; + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6b83177fd41d..972b179bc07a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -157,6 +157,13 @@ enum { */ #define IORING_SETUP_SINGLE_ISSUER (1U << 12) +/* + * Defer running task work to get events. + * Rather than running bits of task work whenever the task transitions + * try to do it just before it is needed. + */ +#define IORING_SETUP_DEFER_TASKRUN (1U << 13) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, -- cgit v1.2.3 From 21a091b970cdbcf3e8ff829234b51be6f9192766 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:12 -0700 Subject: io_uring: signal registered eventfd to process deferred task work Some workloads rely on a registered eventfd (via io_uring_register_eventfd(3)) in order to wake up and process the io_uring. In the case of a ring setup with IORING_SETUP_DEFER_TASKRUN, that eventfd also needs to be signalled when there are tasks to run. This changes an old behaviour which assumed 1 eventfd signal implied at least 1 CQE, however only when this new flag is set (and so old users will not notice). This should be expected with the IORING_SETUP_DEFER_TASKRUN flag as it is not guaranteed that every task will result in a CQE. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-7-dylany@fb.com [axboe: fold in call_rcu() serialization fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d56ff2185168..aa4d90a53866 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -184,6 +184,8 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async: 1; struct rcu_head rcu; + atomic_t refs; + atomic_t ops; }; struct io_alloc_cache { -- cgit v1.2.3 From f75d5036d04cd57103fe1a50dffceb7c1040fbe7 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:13 -0700 Subject: io_uring: trace local task work run Add tracing for io_run_local_task_work Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-8-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index c5b21ff0ac85..936fd41bf147 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -655,6 +655,35 @@ TRACE_EVENT(io_uring_short_write, __entry->wanted, __entry->got) ); +/* + * io_uring_local_work_run - ran ring local task work + * + * @tctx: pointer to a io_uring_ctx + * @count: how many functions it ran + * @loops: how many loops it ran + * + */ +TRACE_EVENT(io_uring_local_work_run, + + TP_PROTO(void *ctx, int count, unsigned int loops), + + TP_ARGS(ctx, count, loops), + + TP_STRUCT__entry ( + __field(void *, ctx ) + __field(int, count ) + __field(unsigned int, loops ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->count = count; + __entry->loops = loops; + ), + + TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ -- cgit v1.2.3 From de27e18e86173b704beaa19f0ee376f3305c4794 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:40 +0530 Subject: fs: add file_operations->uring_cmd_iopoll io_uring will invoke this to do completion polling on uring-cmd operations. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..d6badd19784f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,6 +2132,7 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); + int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 5756a3a7e713bcab705a5f0c810a2b1f7f4ecfaa Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:41 +0530 Subject: io_uring: add iopoll infrastructure for io_uring_cmd Put this up in the same way as iopoll is done for regular read/write IO. Make place for storing a cookie into struct io_uring_cmd on submission. Perform the completion using the ->uring_cmd_iopoll handler. Signed-off-by: Kanchan Joshi Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220823161443.49436-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 4a2f6cc5a492..58676c0a398f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -20,8 +20,12 @@ enum io_uring_cmd_flags { struct io_uring_cmd { struct file *file; const void *cmd; - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd); + union { + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + /* used for polled completion */ + void *cookie; + }; u32 cmd_op; u32 pad; u8 pdu[32]; /* available inline for free use */ -- cgit v1.2.3 From c6e99ea482e2a9e1fef2488891242f9749584225 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:42 +0530 Subject: block: export blk_rq_is_poll This is in preparation to support iopoll for nvme passthrough. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-4-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..de384f5d2c37 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -980,6 +980,7 @@ int blk_rq_map_kern(struct request_queue *, struct request *, void *, int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); +bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; -- cgit v1.2.3 From de97fcb30316410a2c46be102f074a454ecc6cf1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Sep 2022 15:18:05 -0600 Subject: fs: add batch and poll flags to the uring_cmd_iopoll() handler We need the poll_flags to know how to poll for the IO, and we should have the batch structure in preparation for supporting batched completions with iopoll. Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d6badd19784f..01681d061a6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,7 +2132,8 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); - int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); + int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, + unsigned int poll_flags); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 493108d95f1464ccd101d4e5cfa7e93f1fc64d47 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 21 Sep 2022 12:17:54 +0100 Subject: io_uring/net: zerocopy sendmsg Add a zerocopy version of sendmsg. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6aabc4bdfc0ec78df6ec9328137e394af9d4e7ef.1663668091.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 972b179bc07a..92f29d9505a6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -213,6 +213,7 @@ enum io_uring_op { IORING_OP_SOCKET, IORING_OP_URING_CMD, IORING_OP_SEND_ZC, + IORING_OP_SENDMSG_ZC, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3