From e307e6698165ca6508ed42c69cb1be76c8eb6a3c Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 27 Oct 2022 20:34:45 +0200 Subject: io_uring/net: introduce IORING_SEND_ZC_REPORT_USAGE flag It might be useful for applications to detect if a zero copy transfer with SEND[MSG]_ZC was actually possible or not. The application can fallback to plain SEND[MSG] in order to avoid the overhead of two cqes per request. Or it can generate a log message that could indicate to an administrator that no zero copy was possible and could explain degraded performance. Cc: stable@vger.kernel.org # 6.1 Link: https://lore.kernel.org/io-uring/fb6a7599-8a9b-15e5-9b64-6cd9d01c6ff4@gmail.com/T/#m2b0d9df94ce43b0e69e6c089bdff0ce6babbdfaa Signed-off-by: Stefan Metzmacher Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/8945b01756d902f5d5b0667f20b957ad3f742e5e.1666895626.git.metze@samba.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2df3225b562f..9d4c4078e8d0 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -296,10 +296,28 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. + * + * IORING_SEND_ZC_REPORT_USAGE + * If set, SEND[MSG]_ZC should report + * the zerocopy usage in cqe.res + * for the IORING_CQE_F_NOTIF cqe. + * 0 is reported if zerocopy was actually possible. + * IORING_NOTIF_USAGE_ZC_COPIED if data was copied + * (at least partially). */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_SEND_ZC_REPORT_USAGE (1U << 3) + +/* + * cqe.res for IORING_CQE_F_NOTIF if + * IORING_SEND_ZC_REPORT_USAGE was requested + * + * It should be treated as a flag, all other + * bits of cqe.res should be treated as reserved! + */ +#define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) /* * accept flags stored in sqe->ioprio -- cgit v1.2.3 From caf1aeaffc3b09649a56769e559333ae2c4f1802 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:10:53 -0700 Subject: eventpoll: add EPOLL_URING_WAKE poll wakeup flag We can have dependencies between epoll and io_uring. Consider an epoll context, identified by the epfd file descriptor, and an io_uring file descriptor identified by iofd. If we add iofd to the epfd context, and arm a multishot poll request for epfd with iofd, then the multishot poll request will repeatedly trigger and generate events until terminated by CQ ring overflow. This isn't a desired behavior. Add EPOLL_URING so that io_uring can pass it in as part of the poll wakeup key, and io_uring can check for that to detect a potential recursive invocation. Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- include/uapi/linux/eventpoll.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 8a3432d0f0dc..e687658843b1 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -41,6 +41,12 @@ #define EPOLLMSG (__force __poll_t)0x00000400 #define EPOLLRDHUP (__force __poll_t)0x00002000 +/* + * Internal flag - wakeup generated by io_uring, used to detect recursion back + * into the io_uring poll handler. + */ +#define EPOLL_URING_WAKE ((__force __poll_t)(1U << 27)) + /* Set exclusive wakeup mode for the target file descriptor */ #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) -- cgit v1.2.3 From 03e02acda8e267a8183e1e0ed289ff1ef9cd7ed8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:13:44 -0700 Subject: eventfd: provide a eventfd_signal_mask() helper This is identical to eventfd_signal(), but it allows the caller to pass in a mask to be used for the poll wakeup key. The use case is avoiding repeated multishot triggers if we have a dependency between eventfd and io_uring. If we setup an eventfd context and register that as the io_uring eventfd, and at the same time queue a multishot poll request for the eventfd context, then any CQE posted will repeatedly trigger the multishot request until it terminates when the CQ ring overflows. In preparation for io_uring detecting this circular dependency, add the mentioned helper so that io_uring can pass in EPOLL_URING as part of the poll wakeup key. Cc: stable@vger.kernel.org # 6.0 [axboe: fold in !CONFIG_EVENTFD fix from Zhang Qilong] Signed-off-by: Jens Axboe --- include/linux/eventfd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 30eb30d6909b..786824f58d3d 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, int n) return -ENOSYS; } +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, + unsigned mask) +{ + return -ENOSYS; +} + static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) { -- cgit v1.2.3 From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:54 -0800 Subject: io_uring: allow defer completion for aux posted cqes Multishot ops cannot use the compl_reqs list as the request must stay in the poll list, but that means they need to run each completion without benefiting from batching. Here introduce batching infrastructure for only small (ie 16 byte) CQEs. This restriction is ok because there are no use cases posting 32 byte CQEs. In the ring keep a batch of up to 16 posted results, and flush in the same way as compl_reqs. 16 was chosen through experimentation on a microbenchmark ([1]), as well as trying not to increase the size of the ring too much. This increases the size to 1472 bytes from 1216. [1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10 Gives results: baseline 8309 k/s 8 18807 k/s 16 19338 k/s 32 20134 k/s Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f5b687a787a3..accdfecee953 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -174,7 +174,9 @@ struct io_submit_state { bool plug_started; bool need_plug; unsigned short submit_nr; + unsigned int cqes_count; struct blk_plug plug; + struct io_uring_cqe cqes[16]; }; struct io_ev_fd { -- cgit v1.2.3