summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 16:22:30 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 16:22:30 -0700
commit23acda7c221a76ff711d65f4ca90029d43b249a0 (patch)
tree3e7745c9210489864e153990c06833d7d47a3dcd /io_uring
parent7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (diff)
parentc5e9f6a96bf7379da87df1b852b90527e242b56f (diff)
Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:

 - Add a callback driven main loop for io_uring, and BPF struct_ops on
   top to allow implementing custom event loop logic

 - Decouple IOPOLL from being a ring-wide all-or-nothing setting,
   allowing IOPOLL use cases to also issue certain white listed
   non-polled opcodes

 - Timeout improvements. Migrate internal timeout storage from
   timespec64 to ktime_t for simpler arithmetic and avoid copying of
   timespec data

 - Zero-copy receive (zcrx) updates:
     - Add a device-less mode (ZCRX_REG_NODEV) for testing and
       experimentation where data flows through the copy fallback path
     - Fix two-step unregistration regression, DMA length calculations,
       xarray mark usage, and a potential 32-bit overflow in id shifting
     - Refactoring toward multi-area support: dedicated refill queue
       struct, consolidated DMA syncing, netmem array refilling format,
       and guard-based locking

 - Zero-copy transmit (zctx) cleanup:
     - Unify io_send_zc() and io_sendmsg_zc() into a single function
     - Add vectorized registered buffer send for IORING_OP_SEND_ZC
     - Add separate notification user_data via sqe->addr3 so
       notification and completion CQEs can be distinguished without
       extra reference counting

 - Switch struct io_ring_ctx internal bitfields to explicit flag bits
   with atomic-safe accessors, and annotate the known harmless races on
   those flags

 - Various optimizations caching ctx and other request fields in local
   variables to avoid repeated loads, and cleanups for tctx setup, ring
   fd registration, and read path early returns

* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
  io_uring: unify getting ctx from passed in file descriptor
  io_uring/register: don't get a reference to the registered ring fd
  io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
  io_uring/tctx: have io_uring_alloc_task_context() return tctx
  io_uring/timeout: use 'ctx' consistently
  io_uring/rw: clean up __io_read() obsolete comment and early returns
  io_uring/zcrx: use correct mmap off constants
  io_uring/zcrx: use dma_len for chunk size calculation
  io_uring/zcrx: don't clear not allocated niovs
  io_uring/zcrx: don't use mark0 for allocating xarray
  io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
  io_uring/zcrx: reject REG_NODEV with large rx_buf_size
  io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
  io_uring/rsrc: use io_cache_free() to free node
  io_uring/zcrx: rename zcrx [un]register functions
  io_uring/zcrx: check ctrl op payload struct sizes
  io_uring/zcrx: cache fallback availability in zcrx ctx
  io_uring/zcrx: warn on a repeated area append
  io_uring/zcrx: consolidate dma syncing
  io_uring/zcrx: netmem array as refiling format
  ...
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Kconfig5
-rw-r--r--io_uring/Makefile3
-rw-r--r--io_uring/bpf-ops.c270
-rw-r--r--io_uring/bpf-ops.h28
-rw-r--r--io_uring/cancel.c9
-rw-r--r--io_uring/cmd_net.c34
-rw-r--r--io_uring/eventfd.c4
-rw-r--r--io_uring/io_uring.c183
-rw-r--r--io_uring/io_uring.h11
-rw-r--r--io_uring/kbuf.c4
-rw-r--r--io_uring/loop.c91
-rw-r--r--io_uring/loop.h27
-rw-r--r--io_uring/msg_ring.c2
-rw-r--r--io_uring/net.c148
-rw-r--r--io_uring/net.h1
-rw-r--r--io_uring/opdef.c12
-rw-r--r--io_uring/opdef.h2
-rw-r--r--io_uring/poll.c8
-rw-r--r--io_uring/query.c4
-rw-r--r--io_uring/register.c49
-rw-r--r--io_uring/register.h1
-rw-r--r--io_uring/rsrc.c15
-rw-r--r--io_uring/rw.c24
-rw-r--r--io_uring/sqpoll.c8
-rw-r--r--io_uring/tctx.c79
-rw-r--r--io_uring/tctx.h4
-rw-r--r--io_uring/timeout.c78
-rw-r--r--io_uring/timeout.h2
-rw-r--r--io_uring/tw.c2
-rw-r--r--io_uring/uring_cmd.c9
-rw-r--r--io_uring/wait.h1
-rw-r--r--io_uring/zcrx.c384
-rw-r--r--io_uring/zcrx.h34
33 files changed, 1030 insertions, 506 deletions
diff --git a/io_uring/Kconfig b/io_uring/Kconfig
index a7ae23cf1035..a283d9e53787 100644
--- a/io_uring/Kconfig
+++ b/io_uring/Kconfig
@@ -14,3 +14,8 @@ config IO_URING_BPF
def_bool y
depends on BPF
depends on NET
+
+config IO_URING_BPF_OPS
+ def_bool y
+ depends on IO_URING
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 931f9156132a..c54e328d1410 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
advise.o openclose.o statx.o timeout.o \
cancel.o waitid.o register.o \
truncate.o memmap.o alloc_cache.o \
- query.o
+ query.o loop.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
@@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
+obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o
diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c
new file mode 100644
index 000000000000..937e48bef40b
--- /dev/null
+++ b/io_uring/bpf-ops.c
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/mutex.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+
+#include "io_uring.h"
+#include "register.h"
+#include "loop.h"
+#include "memmap.h"
+#include "bpf-ops.h"
+
+static DEFINE_MUTEX(io_bpf_ctrl_mutex);
+static const struct btf_type *loop_params_type;
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr)
+{
+ return io_submit_sqes(ctx, nr);
+}
+
+__bpf_kfunc
+__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id,
+ const size_t rdwr_buf_size)
+{
+ struct io_mapped_region *r;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ switch (region_id) {
+ case IOU_REGION_MEM:
+ r = &ctx->param_region;
+ break;
+ case IOU_REGION_CQ:
+ r = &ctx->ring_region;
+ break;
+ case IOU_REGION_SQ:
+ r = &ctx->sq_region;
+ break;
+ default:
+ return NULL;
+ }
+
+ if (unlikely(rdwr_buf_size > io_region_size(r)))
+ return NULL;
+ return io_region_get_ptr(r);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(io_uring_kfunc_set)
+BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
+BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL);
+BTF_KFUNCS_END(io_uring_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &io_uring_kfunc_set,
+};
+
+static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx,
+ struct iou_loop_params *lp)
+{
+ return IOU_LOOP_STOP;
+}
+
+static struct io_uring_bpf_ops io_bpf_ops_stubs = {
+ .loop_step = io_bpf_ops__loop_step,
+};
+
+static bool bpf_io_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type != BPF_READ)
+ return false;
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ return btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg, int off,
+ int size)
+{
+ const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+
+ if (t == loop_params_type) {
+ if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx))
+ return SCALAR_VALUE;
+ }
+
+ return -EACCES;
+}
+
+static const struct bpf_verifier_ops bpf_io_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = bpf_io_is_valid_access,
+ .btf_struct_access = bpf_io_btf_struct_access,
+};
+
+static const struct btf_type *
+io_lookup_struct_type(struct btf *btf, const char *name)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return NULL;
+ return btf_type_by_id(btf, type_id);
+}
+
+static int bpf_io_init(struct btf *btf)
+{
+ int ret;
+
+ loop_params_type = io_lookup_struct_type(btf, "iou_loop_params");
+ if (!loop_params_type) {
+ pr_err("io_uring: Failed to locate iou_loop_params\n");
+ return -EINVAL;
+ }
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &bpf_io_uring_kfunc_set);
+ if (ret) {
+ pr_err("io_uring: Failed to register kfuncs (%d)\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static int bpf_io_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ return 0;
+}
+
+static int bpf_io_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+ const struct io_uring_bpf_ops *uops = udata;
+ struct io_uring_bpf_ops *ops = kdata;
+
+ switch (moff) {
+ case offsetof(struct io_uring_bpf_ops, ring_fd):
+ ops->ring_fd = uops->ring_fd;
+ return 1;
+ }
+ return 0;
+}
+
+static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops)
+{
+ if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL))
+ return -EOPNOTSUPP;
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+ return -EOPNOTSUPP;
+
+ if (ctx->bpf_ops)
+ return -EBUSY;
+ if (WARN_ON_ONCE(!ops->loop_step))
+ return -EINVAL;
+
+ ops->priv = ctx;
+ ctx->bpf_ops = ops;
+ ctx->loop_step = ops->loop_step;
+ return 0;
+}
+
+static int bpf_io_reg(void *kdata, struct bpf_link *link)
+{
+ struct io_uring_bpf_ops *ops = kdata;
+ struct io_ring_ctx *ctx;
+ struct file *file;
+ int ret = -EBUSY;
+
+ file = io_uring_ctx_get_file(ops->ring_fd, false);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ ctx = file->private_data;
+
+ scoped_guard(mutex, &io_bpf_ctrl_mutex) {
+ guard(mutex)(&ctx->uring_lock);
+ ret = io_install_bpf(ctx, ops);
+ }
+
+ fput(file);
+ return ret;
+}
+
+static void io_eject_bpf(struct io_ring_ctx *ctx)
+{
+ struct io_uring_bpf_ops *ops = ctx->bpf_ops;
+
+ if (WARN_ON_ONCE(!ops))
+ return;
+ if (WARN_ON_ONCE(ops->priv != ctx))
+ return;
+
+ ops->priv = NULL;
+ ctx->bpf_ops = NULL;
+ ctx->loop_step = NULL;
+}
+
+static void bpf_io_unreg(void *kdata, struct bpf_link *link)
+{
+ struct io_uring_bpf_ops *ops = kdata;
+ struct io_ring_ctx *ctx;
+
+ guard(mutex)(&io_bpf_ctrl_mutex);
+ ctx = ops->priv;
+ if (ctx) {
+ guard(mutex)(&ctx->uring_lock);
+ if (WARN_ON_ONCE(ctx->bpf_ops != ops))
+ return;
+
+ io_eject_bpf(ctx);
+ }
+}
+
+void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
+{
+ /*
+ * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock,
+ * and read protected by either. Try to avoid taking the global lock
+ * for rings that never had any bpf installed.
+ */
+ scoped_guard(mutex, &ctx->uring_lock) {
+ if (!ctx->bpf_ops)
+ return;
+ }
+
+ guard(mutex)(&io_bpf_ctrl_mutex);
+ guard(mutex)(&ctx->uring_lock);
+ if (ctx->bpf_ops)
+ io_eject_bpf(ctx);
+}
+
+static struct bpf_struct_ops bpf_ring_ops = {
+ .verifier_ops = &bpf_io_verifier_ops,
+ .reg = bpf_io_reg,
+ .unreg = bpf_io_unreg,
+ .check_member = bpf_io_check_member,
+ .init_member = bpf_io_init_member,
+ .init = bpf_io_init,
+ .cfi_stubs = &io_bpf_ops_stubs,
+ .name = "io_uring_bpf_ops",
+ .owner = THIS_MODULE,
+};
+
+static int __init io_uring_bpf_init(void)
+{
+ int ret;
+
+ ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops);
+ if (ret) {
+ pr_err("io_uring: Failed to register struct_ops (%d)\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+__initcall(io_uring_bpf_init);
diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h
new file mode 100644
index 000000000000..b39b3fd3acda
--- /dev/null
+++ b/io_uring/bpf-ops.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_BPF_OPS_H
+#define IOU_BPF_OPS_H
+
+#include <linux/io_uring_types.h>
+
+enum {
+ IOU_REGION_MEM,
+ IOU_REGION_CQ,
+ IOU_REGION_SQ,
+};
+
+struct io_uring_bpf_ops {
+ int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp);
+
+ __u32 ring_fd;
+ void *priv;
+};
+
+#ifdef CONFIG_IO_URING_BPF_OPS
+void io_unregister_bpf_ops(struct io_ring_ctx *ctx);
+#else
+static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif /* IOU_BPF_OPS_H */
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 65e04063e343..5e5eb9cfc7cd 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -156,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
cancel->fd = READ_ONCE(sqe->fd);
}
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
+ u32 op;
+
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
- cancel->opcode = READ_ONCE(sqe->len);
+
+ op = READ_ONCE(sqe->len);
+ if (op >= IORING_OP_LAST)
+ return -EINVAL;
+
+ cancel->opcode = op;
}
return 0;
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index 125a81c520a6..7cd411fc4f33 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -7,6 +7,21 @@
#include "uring_cmd.h"
#include "io_uring.h"
+static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op)
+{
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ ret = prot->ioctl(sk, op, &arg);
+ if (ret)
+ return ret;
+ return arg;
+}
+
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
@@ -156,27 +171,12 @@ static int io_uring_cmd_getsockname(struct socket *sock,
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
- struct sock *sk = sock->sk;
- struct proto *prot = READ_ONCE(sk->sk_prot);
- int ret, arg = 0;
switch (cmd->cmd_op) {
case SOCKET_URING_OP_SIOCINQ:
- if (!prot || !prot->ioctl)
- return -EOPNOTSUPP;
-
- ret = prot->ioctl(sk, SIOCINQ, &arg);
- if (ret)
- return ret;
- return arg;
+ return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ);
case SOCKET_URING_OP_SIOCOUTQ:
- if (!prot || !prot->ioctl)
- return -EOPNOTSUPP;
-
- ret = prot->ioctl(sk, SIOCOUTQ, &arg);
- if (ret)
- return ret;
- return arg;
+ return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ);
case SOCKET_URING_OP_GETSOCKOPT:
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 7482a7dc6b38..3da028500f76 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
+ ctx->int_flags |= IO_RING_F_HAS_EVFD;
refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
@@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
- ctx->has_evfd = false;
+ ctx->int_flags &= ~IO_RING_F_HAS_EVFD;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
io_eventfd_put(ev_fd);
return 0;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4d7bcbb97406..dd6326dc5f88 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -87,6 +87,7 @@
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
+#include "bpf-ops.h"
#include "timeout.h"
#include "poll.h"
@@ -95,6 +96,7 @@
#include "eventfd.h"
#include "wait.h"
#include "bpf_filter.h"
+#include "loop.h"
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -356,7 +358,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CREDS)) {
req->flags |= REQ_F_CREDS;
@@ -378,7 +379,7 @@ static void io_prep_async_work(struct io_kiocb *req)
if (should_hash && (req->file->f_flags & O_DIRECT) &&
(req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
should_hash = false;
- if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
+ if (should_hash || (req->flags & REQ_F_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
@@ -477,17 +478,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (ctx->poll_activated)
+ if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED)
io_poll_wq_wake(ctx);
- if (ctx->off_timeout_used)
+ if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)
io_flush_timeouts(ctx);
- if (ctx->has_evfd)
+ if (ctx->int_flags & IO_RING_F_HAS_EVFD)
io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
- if (!ctx->lockless_cq)
+ if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
spin_lock(&ctx->completion_lock);
}
@@ -500,11 +501,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
io_commit_cqring(ctx);
- if (!ctx->task_complete) {
- if (!ctx->lockless_cq)
+ if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
+ if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
spin_unlock(&ctx->completion_lock);
/* IOPOLL rings only need to wake up if it's also SQPOLL */
- if (!ctx->syscall_iopoll)
+ if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL))
io_cqring_wake(ctx);
}
io_commit_cqring_flush(ctx);
@@ -589,6 +590,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
+void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx)
+{
+ __io_cqring_overflow_flush(ctx, false);
+}
+
/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct io_kiocb *req)
{
@@ -830,7 +836,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
lockdep_assert_held(&ctx->uring_lock);
- lockdep_assert(ctx->lockless_cq);
+ lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ);
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
@@ -860,7 +866,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
lockdep_assert(!io_wq_current_is_worker());
lockdep_assert_held(&ctx->uring_lock);
- if (!ctx->lockless_cq) {
+ if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
spin_lock(&ctx->completion_lock);
posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
spin_unlock(&ctx->completion_lock);
@@ -885,7 +891,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2])
lockdep_assert_held(&ctx->uring_lock);
cqe[0].user_data = req->cqe.user_data;
- if (!ctx->lockless_cq) {
+ if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
spin_lock(&ctx->completion_lock);
posted = io_fill_cqe_aux32(ctx, cqe);
spin_unlock(&ctx->completion_lock);
@@ -913,7 +919,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
+ if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) {
defer_complete:
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
@@ -1067,12 +1073,14 @@ void io_queue_next(struct io_kiocb *req)
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
if (req->file_node) {
- io_put_rsrc_node(req->ctx, req->file_node);
+ io_put_rsrc_node(ctx, req->file_node);
req->file_node = NULL;
}
if (req->flags & REQ_F_BUF_NODE)
- io_put_rsrc_node(req->ctx, req->buf_node);
+ io_put_rsrc_node(ctx, req->buf_node);
}
static void io_free_batch_list(struct io_ring_ctx *ctx,
@@ -1135,7 +1143,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
- if (ctx->lockless_cq)
+ if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ)
io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
else
io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
@@ -1148,7 +1156,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
INIT_WQ_LIST(&state->compl_reqs);
}
- if (unlikely(ctx->drain_active))
+ if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
io_queue_deferred(ctx);
ctx->submit_state.cq_flush = false;
@@ -1187,7 +1195,6 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
- unsigned int nr_events = 0;
unsigned long check_cq;
min_events = min(min_events, ctx->cq_entries);
@@ -1230,8 +1237,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
* very same mutex.
*/
if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) {
- u32 tail = ctx->cached_cq_tail;
-
(void) io_run_local_work_locked(ctx, min_events);
if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) {
@@ -1240,7 +1245,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
mutex_lock(&ctx->uring_lock);
}
/* some requests don't go through iopoll_list */
- if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list))
+ if (list_empty(&ctx->iopoll_list))
break;
}
ret = io_do_iopoll(ctx, !min_events);
@@ -1251,9 +1256,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
return -EINTR;
if (need_resched())
break;
-
- nr_events += ret;
- } while (nr_events < min_events);
+ } while (io_cqring_events(ctx) < min_events);
return 0;
}
@@ -1344,7 +1347,7 @@ static __cold void io_drain_req(struct io_kiocb *req)
list_add_tail(&de->list, &ctx->defer_list);
io_queue_deferred(ctx);
if (!drain && list_empty(&ctx->defer_list))
- ctx->drain_active = false;
+ ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -1418,8 +1421,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (ret == IOU_ISSUE_SKIP_COMPLETE) {
ret = 0;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ if (req->flags & REQ_F_IOPOLL)
io_iopoll_req_issued(req, issue_flags);
}
return ret;
@@ -1435,7 +1437,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
io_tw_lock(req->ctx, tw);
WARN_ON_ONCE(!req->file);
- if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL))
return -EFAULT;
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
@@ -1533,7 +1535,7 @@ fail:
* wait for request slots on the block side.
*/
if (!needs_poll) {
- if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (!(req->flags & REQ_F_IOPOLL))
break;
if (io_wq_worker_stopped())
break;
@@ -1655,7 +1657,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
} else {
/* can't fail with IO_URING_F_INLINE */
io_req_sqe_copy(req, IO_URING_F_INLINE);
- if (unlikely(req->ctx->drain_active))
+ if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
io_drain_req(req);
else
io_queue_iowq(req);
@@ -1671,7 +1673,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (!ctx->op_restricted)
+ if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED))
return true;
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -1691,7 +1693,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
{
struct io_kiocb *head = ctx->submit_state.link.head;
- ctx->drain_active = true;
+ ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
if (head) {
/*
* If we need to drain a request in the middle of a link, drain
@@ -1701,7 +1703,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
* link.
*/
head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
- ctx->drain_next = true;
+ ctx->int_flags |= IO_RING_F_DRAIN_NEXT;
}
}
@@ -1767,23 +1769,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
- ctx->drain_disabled = true;
+ ctx->int_flags |= IO_RING_F_DRAIN_DISABLED;
if (sqe_flags & IOSQE_IO_DRAIN) {
- if (ctx->drain_disabled)
+ if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED)
return io_init_fail_req(req, -EOPNOTSUPP);
io_init_drain(ctx);
}
}
- if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) {
+ if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) {
if (!io_check_restriction(ctx, req, sqe_flags))
return io_init_fail_req(req, -EACCES);
/* knock it to the slow queue path, will be drained there */
- if (ctx->drain_active)
+ if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)
req->flags |= REQ_F_FORCE_ASYNC;
/* if there is no link, we're at "next" request and need to drain */
- if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
- ctx->drain_next = false;
- ctx->drain_active = true;
+ if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) {
+ ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT;
+ ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
}
}
@@ -2148,12 +2150,13 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
+ io_unregister_bpf_ops(ctx);
io_sq_thread_finish(ctx);
mutex_lock(&ctx->uring_lock);
io_sqe_buffers_unregister(ctx);
io_sqe_files_unregister(ctx);
- io_unregister_zcrx_ifqs(ctx);
+ io_unregister_zcrx(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
io_free_alloc_caches(ctx);
@@ -2204,7 +2207,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
poll_wq_task_work);
mutex_lock(&ctx->uring_lock);
- ctx->poll_activated = true;
+ ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
mutex_unlock(&ctx->uring_lock);
/*
@@ -2219,9 +2222,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
- if (ctx->poll_activated || ctx->poll_wq_task_work.func)
+ if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func)
goto out;
- if (WARN_ON_ONCE(!ctx->task_complete))
+ if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)))
goto out;
if (!ctx->submitter_task)
goto out;
@@ -2242,7 +2245,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- if (unlikely(!ctx->poll_activated))
+ if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED)))
io_activate_pollwq(ctx);
/*
* provides mb() which pairs with barrier from wq_has_sleeper
@@ -2308,6 +2311,10 @@ static __cold void io_ring_exit_work(struct work_struct *work)
struct io_tctx_node *node;
int ret;
+ mutex_lock(&ctx->uring_lock);
+ io_terminate_zcrx(ctx);
+ mutex_unlock(&ctx->uring_lock);
+
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
@@ -2539,39 +2546,54 @@ uaccess_end:
#endif
}
-SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const void __user *, argp,
- size_t, argsz)
+/*
+ * Given an 'fd' value, return the ctx associated with if. If 'registered' is
+ * true, then the registered index is used. Otherwise, the normal fd table.
+ * Caller must call fput() on the returned file if it isn't a registered file,
+ * unless it's an ERR_PTR.
+ */
+struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
{
- struct io_ring_ctx *ctx;
struct file *file;
- long ret;
- if (unlikely(flags & ~IORING_ENTER_FLAGS))
- return -EINVAL;
-
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- if (flags & IORING_ENTER_REGISTERED_RING) {
+ if (registered) {
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
- if (unlikely(!file))
- return -EBADF;
} else {
file = fget(fd);
- if (unlikely(!file))
- return -EBADF;
- ret = -EOPNOTSUPP;
- if (unlikely(!io_is_uring_fops(file)))
- goto out;
}
+ if (unlikely(!file))
+ return ERR_PTR(-EBADF);
+ if (io_is_uring_fops(file))
+ return file;
+ fput(file);
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+ u32, min_complete, u32, flags, const void __user *, argp,
+ size_t, argsz)
+{
+ struct io_ring_ctx *ctx;
+ struct file *file;
+ long ret;
+
+ if (unlikely(flags & ~IORING_ENTER_FLAGS))
+ return -EINVAL;
+
+ file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ctx = file->private_data;
ret = -EBADFD;
/*
@@ -2581,6 +2603,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED))
goto out;
+ if (io_has_loop_ops(ctx)) {
+ ret = io_run_loop(ctx);
+ goto out;
+ }
+
/*
* For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if
@@ -2610,7 +2637,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
- if (ctx->syscall_iopoll)
+ if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)
goto iopoll_locked;
/*
* Ignore errors, we'll soon call io_cqring_wait() and
@@ -2625,7 +2652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (flags & IORING_ENTER_GETEVENTS) {
int ret2;
- if (ctx->syscall_iopoll) {
+ if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) {
/*
* We disallow the app entering submit/complete with
* polling, but we still need to lock the ring to
@@ -2926,9 +2953,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
if (dst->bpf_filters)
WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
if (dst->op_registered)
- ctx->op_restricted = 1;
+ ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
if (dst->reg_registered)
- ctx->reg_restricted = 1;
+ ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
}
static __cold int io_uring_create(struct io_ctx_config *config)
@@ -2955,17 +2982,18 @@ static __cold int io_uring_create(struct io_ctx_config *config)
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL))
- ctx->task_complete = true;
+ ctx->int_flags |= IO_RING_F_TASK_COMPLETE;
- if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
- ctx->lockless_cq = true;
+ if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) ||
+ (ctx->flags & IORING_SETUP_IOPOLL))
+ ctx->int_flags |= IO_RING_F_LOCKLESS_CQ;
/*
* lazy poll_wq activation relies on ->task_complete for synchronisation
* purposes, see io_activate_pollwq()
*/
- if (!ctx->task_complete)
- ctx->poll_activated = true;
+ if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))
+ ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
/*
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
@@ -2975,9 +3003,10 @@ static __cold int io_uring_create(struct io_ctx_config *config)
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL))
- ctx->syscall_iopoll = 1;
+ ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL;
- ctx->compat = in_compat_syscall();
+ if (in_compat_syscall())
+ ctx->int_flags |= IO_RING_F_COMPAT;
if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index ee24bc5d77b3..e612a66ee80e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -185,6 +185,7 @@ void io_req_track_inflight(struct io_kiocb *req);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
+struct file *io_uring_ctx_get_file(unsigned int fd, bool registered);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw);
@@ -223,7 +224,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
if (ctx->flags & IORING_SETUP_IOPOLL) {
lockdep_assert_held(&ctx->uring_lock);
- } else if (!ctx->task_complete) {
+ } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
lockdep_assert_held(&ctx->completion_lock);
} else if (ctx->submitter_task) {
/*
@@ -240,7 +241,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
static inline bool io_is_compat(struct io_ring_ctx *ctx)
{
- return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat);
+ return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT);
}
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
@@ -494,10 +495,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
+#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \
+ IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED)
+
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used ||
- ctx->has_evfd || ctx->poll_activated))
+ if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK))
__io_commit_cqring_flush(ctx);
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 5257b3aad395..8da2ff798170 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -230,7 +230,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
struct io_br_sel sel = { };
struct io_buffer_list *bl;
- io_ring_submit_lock(req->ctx, issue_flags);
+ io_ring_submit_lock(ctx, issue_flags);
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
@@ -239,7 +239,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
else
sel.addr = io_provided_buffer_select(req, len, bl);
}
- io_ring_submit_unlock(req->ctx, issue_flags);
+ io_ring_submit_unlock(ctx, issue_flags);
return sel;
}
diff --git a/io_uring/loop.c b/io_uring/loop.c
new file mode 100644
index 000000000000..31843cc3e451
--- /dev/null
+++ b/io_uring/loop.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "io_uring.h"
+#include "wait.h"
+#include "loop.h"
+
+/*
+ * Number of CQEs still outstanding before the loop's wait target
+ * (lp->cq_wait_idx) is reached. The u32 subtraction wraps and is
+ * truncated to int, so a CQ tail that has already passed the target
+ * index yields a value <= 0.
+ */
+static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx,
+				  const struct iou_loop_params *lp)
+{
+	return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail);
+}
+
+/*
+ * Publish how many CQEs we intend to wait for and mark the task
+ * interruptible. Must be followed by a recheck of the wait condition
+ * before actually scheduling, see io_loop_wait().
+ */
+static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait)
+{
+	atomic_set(&ctx->cq_wait_nr, nr_wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+}
+
+/* Undo io_loop_wait_start(): back to TASK_RUNNING, reset cq_wait_nr. */
+static inline void io_loop_wait_finish(struct io_ring_ctx *ctx)
+{
+	__set_current_state(TASK_RUNNING);
+	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
+}
+
+/*
+ * Sleep until roughly @nr_wait CQEs are available or we are woken up.
+ * Called with uring_lock held; the lock is dropped across schedule()
+ * and re-acquired afterwards. Bails out without sleeping if local task
+ * work is pending, the wait target was already reached, or check_cq
+ * signals pending CQ work.
+ */
+static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp,
+			 unsigned nr_wait)
+{
+	io_loop_wait_start(ctx, nr_wait);
+
+	/* recheck after publishing cq_wait_nr so a wakeup can't be missed */
+	if (unlikely(io_local_work_pending(ctx) ||
+		     io_loop_nr_cqes(ctx, lp) <= 0 ||
+		     READ_ONCE(ctx->check_cq))) {
+		io_loop_wait_finish(ctx);
+		return;
+	}
+
+	mutex_unlock(&ctx->uring_lock);
+	schedule();
+	io_loop_wait_finish(ctx);
+	mutex_lock(&ctx->uring_lock);
+}
+
+/*
+ * Core callback-driven event loop: repeatedly invoke the ctx->loop_step
+ * callback, then wait for CQEs, run pending task work and flush CQ
+ * overflow. Runs with uring_lock held. Returns 0 once the callback
+ * asks to stop, -EINVAL on an unknown callback return value, -EINTR on
+ * a pending signal, -EFAULT if the callback disappears mid-loop.
+ */
+static int __io_run_loop(struct io_ring_ctx *ctx)
+{
+	struct iou_loop_params lp = {};
+
+	while (true) {
+		int nr_wait, step_res;
+
+		/* loop_step can be cleared underneath us; recheck each pass */
+		if (unlikely(!ctx->loop_step))
+			return -EFAULT;
+
+		step_res = ctx->loop_step(ctx, &lp);
+		if (step_res == IOU_LOOP_STOP)
+			break;
+		if (step_res != IOU_LOOP_CONTINUE)
+			return -EINVAL;
+
+		/* only sleep if the requested CQE index is still ahead of the tail */
+		nr_wait = io_loop_nr_cqes(ctx, &lp);
+		if (nr_wait > 0)
+			io_loop_wait(ctx, &lp, nr_wait);
+		else
+			nr_wait = 0;
+
+		/* task work must run unlocked; drop and retake uring_lock */
+		if (task_work_pending(current)) {
+			mutex_unlock(&ctx->uring_lock);
+			io_run_task_work();
+			mutex_lock(&ctx->uring_lock);
+		}
+		if (unlikely(task_sigpending(current)))
+			return -EINTR;
+		io_run_local_work_locked(ctx, nr_wait);
+
+		if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			io_cqring_overflow_flush_locked(ctx);
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point for the registered event loop. Rejects callers that are
+ * not allowed to run task work for this ring (-EEXIST), then runs the
+ * loop under uring_lock.
+ */
+int io_run_loop(struct io_ring_ctx *ctx)
+{
+	int ret;
+
+	if (!io_allowed_run_tw(ctx))
+		return -EEXIST;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_run_loop(ctx);
+	mutex_unlock(&ctx->uring_lock);
+	return ret;
+}
diff --git a/io_uring/loop.h b/io_uring/loop.h
new file mode 100644
index 000000000000..d7718b9ce61e
--- /dev/null
+++ b/io_uring/loop.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef IOU_LOOP_H
+#define IOU_LOOP_H
+
+#include <linux/io_uring_types.h>
+
+/* Per-iteration parameters handed to the ctx->loop_step callback. */
+struct iou_loop_params {
+	/*
+	 * The CQE index to wait for. Only serves as a hint and can still be
+	 * woken up earlier.
+	 */
+	__u32 cq_wait_idx;
+};
+
+/* Return values for the ctx->loop_step callback. */
+enum {
+	IOU_LOOP_CONTINUE = 0,
+	IOU_LOOP_STOP,
+};
+
+/*
+ * Lockless hint for whether a loop_step callback is installed; the
+ * data_race() annotation marks the unsynchronized read as intentional,
+ * so callers must tolerate a stale answer.
+ */
+static inline bool io_has_loop_ops(struct io_ring_ctx *ctx)
+{
+	return data_race(ctx->loop_step);
+}
+
+int io_run_loop(struct io_ring_ctx *ctx);
+
+#endif
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 57ad0085869a..3ff9098573db 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
{
- return target_ctx->task_complete;
+ return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE;
}
static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
diff --git a/io_uring/net.c b/io_uring/net.c
index 8885d944130a..30cd22c0b934 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -375,10 +375,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_namelen = addr_len;
}
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
- if (sr->flags & IORING_SEND_VECTORIZED)
- return -EINVAL;
- req->flags |= REQ_F_IMPORT_BUFFER;
- return 0;
+ if (!(sr->flags & IORING_SEND_VECTORIZED)) {
+ req->flags |= REQ_F_IMPORT_BUFFER;
+ return 0;
+ }
+
+ kmsg->msg.msg_iter.nr_segs = sr->len;
+ return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
}
if (req->flags & REQ_F_BUFFER_SELECT)
return 0;
@@ -396,6 +399,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
struct user_msghdr msg;
int ret;
+ sr->flags |= IORING_SEND_VECTORIZED;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
if (unlikely(ret))
@@ -1333,11 +1337,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_ring_ctx *ctx = req->ctx;
struct io_async_msghdr *iomsg;
struct io_kiocb *notif;
+ u64 user_data;
int ret;
zc->done_io = 0;
- if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
+ if (unlikely(READ_ONCE(sqe->__pad2[0])))
return -EINVAL;
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
if (req->flags & REQ_F_CQE_SKIP)
@@ -1346,7 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
notif = zc->notif = io_alloc_notif(ctx);
if (!notif)
return -ENOMEM;
- notif->cqe.user_data = req->cqe.user_data;
+ user_data = READ_ONCE(sqe->addr3);
+ if (!user_data)
+ user_data = req->cqe.user_data;
+
+ notif->cqe.user_data = user_data;
notif->cqe.res = 0;
notif->cqe.flags = IORING_CQE_F_NOTIF;
req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
@@ -1370,7 +1379,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
- if (io_is_compat(req->ctx))
+ if (io_is_compat(ctx))
zc->msg_flags |= MSG_CMSG_COMPAT;
iomsg = io_msg_alloc_async(req);
@@ -1445,22 +1454,39 @@ static int io_sg_from_iter(struct sk_buff *skb,
return ret;
}
-static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
+static int io_send_zc_import(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg,
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *kmsg = req->async_data;
+ struct io_kiocb *notif = sr->notif;
+ int ret;
WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
- sr->notif->buf_index = req->buf_index;
- return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
- (u64)(uintptr_t)sr->buf, sr->len,
- ITER_SOURCE, issue_flags);
+ notif->buf_index = req->buf_index;
+
+ if (!(sr->flags & IORING_SEND_VECTORIZED)) {
+ ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
+ (u64)(uintptr_t)sr->buf, sr->len,
+ ITER_SOURCE, issue_flags);
+ } else {
+ unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
+
+ ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
+ notif, &kmsg->vec, uvec_segs,
+ issue_flags);
+ }
+
+ if (unlikely(ret))
+ return ret;
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
+ return 0;
}
-int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
+int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned msg_flags;
@@ -1471,106 +1497,38 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
-
if (!(req->flags & REQ_F_POLLED) &&
- (zc->flags & IORING_RECVSEND_POLL_FIRST))
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
if (req->flags & REQ_F_IMPORT_BUFFER) {
- req->flags &= ~REQ_F_IMPORT_BUFFER;
- ret = io_send_zc_import(req, issue_flags);
+ ret = io_send_zc_import(req, kmsg, issue_flags);
if (unlikely(ret))
return ret;
}
- msg_flags = zc->msg_flags;
+ msg_flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
-
- kmsg->msg.msg_flags = msg_flags;
- kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
- ret = sock_sendmsg(sock, &kmsg->msg);
-
- if (unlikely(ret < min_ret)) {
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return -EAGAIN;
-
- if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
- zc->done_io += ret;
- return -EAGAIN;
- }
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- }
-
- if (ret >= 0)
- ret += zc->done_io;
- else if (zc->done_io)
- ret = zc->done_io;
-
- /*
- * If we're in io-wq we can't rely on tw ordering guarantees, defer
- * flushing notif to io_send_zc_cleanup()
- */
- if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- io_notif_flush(zc->notif);
- zc->notif = NULL;
- io_req_msg_cleanup(req, 0);
- }
- io_req_set_res(req, ret, IORING_CQE_F_MORE);
- return IOU_COMPLETE;
-}
-int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *kmsg = req->async_data;
- struct socket *sock;
- unsigned flags;
- int ret, min_ret = 0;
-
- if (req->flags & REQ_F_IMPORT_BUFFER) {
- unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
- int ret;
+ kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
- sr->notif->buf_index = req->buf_index;
- ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
- sr->notif, &kmsg->vec, uvec_segs,
- issue_flags);
- if (unlikely(ret))
- return ret;
- req->flags &= ~REQ_F_IMPORT_BUFFER;
+ if (req->opcode == IORING_OP_SEND_ZC) {
+ msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
+ kmsg->msg.msg_flags = msg_flags;
+ ret = sock_sendmsg(sock, &kmsg->msg);
+ } else {
+ kmsg->msg.msg_control_user = sr->msg_control;
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
}
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
- if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
- return -EOPNOTSUPP;
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return -EAGAIN;
-
- flags = sr->msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&kmsg->msg.msg_iter);
-
- kmsg->msg.msg_control_user = sr->msg_control;
- kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
- ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
- if (ret > 0 && io_net_retry(sock, flags)) {
+ if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
sr->done_io += ret;
return -EAGAIN;
}
diff --git a/io_uring/net.h b/io_uring/net.h
index a862960a3bb9..d4d1ddce50e3 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -50,7 +50,6 @@ void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
-int io_send_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 91a23baf415e..c3ef52b70811 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -67,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv,
@@ -82,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev,
@@ -102,7 +100,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read_fixed,
.issue = io_read_fixed,
@@ -116,7 +113,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write_fixed,
.issue = io_write_fixed,
@@ -250,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read,
.issue = io_read,
@@ -264,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write,
.issue = io_write,
@@ -423,7 +417,6 @@ const struct io_issue_def io_issue_defs[] = {
.needs_file = 1,
.plug = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
@@ -437,7 +430,7 @@ const struct io_issue_def io_issue_defs[] = {
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
- .issue = io_send_zc,
+ .issue = io_sendmsg_zc,
#else
.prep = io_eopnotsupp_prep,
#endif
@@ -556,7 +549,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv_fixed,
@@ -571,7 +563,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev_fixed,
@@ -593,7 +584,6 @@ const struct io_issue_def io_issue_defs[] = {
.needs_file = 1,
.plug = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.is_128 = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index faf3955dce8b..667f981e63b0 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -25,8 +25,6 @@ struct io_issue_def {
unsigned poll_exclusive : 1;
/* skip auditing */
unsigned audit_skip : 1;
- /* have to be put into the iopoll list */
- unsigned iopoll_queue : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
/* set to 1 if this opcode uses 128b sqes in a mixed sq */
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 2e9ee47d74bf..74eef7884159 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -277,8 +277,10 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
/* the mask was stashed in __io_poll_execute */
if (!req->cqe.res) {
- struct poll_table_struct pt = { ._key = req->apoll_events };
- req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
+ __poll_t events = req->apoll_events;
+ struct poll_table_struct pt = { ._key = events };
+
+ req->cqe.res = vfs_poll(req->file, &pt) & events;
/*
* We got woken with a mask, but someone else got to
* it first. The above vfs_poll() doesn't add us back
@@ -287,7 +289,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
*/
if (unlikely(!req->cqe.res)) {
/* Multishot armed need not reissue */
- if (!(req->apoll_events & EPOLLONESHOT))
+ if (!(events & EPOLLONESHOT))
continue;
return IOU_POLL_REISSUE;
}
diff --git a/io_uring/query.c b/io_uring/query.c
index 63cc30c9803d..c1704d088374 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -34,12 +34,12 @@ static ssize_t io_query_zcrx(union io_query_data *data)
{
struct io_uring_query_zcrx *e = &data->zcrx;
- e->register_flags = ZCRX_REG_IMPORT;
+ e->register_flags = ZCRX_SUPPORTED_REG_FLAGS;
e->area_flags = IORING_ZCRX_AREA_DMABUF;
e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST;
e->rq_hdr_size = sizeof(struct io_uring);
e->rq_hdr_alignment = L1_CACHE_BYTES;
- e->features = ZCRX_FEATURE_RX_PAGE_SIZE;
+ e->features = ZCRX_FEATURES;
e->__resv2 = 0;
return sizeof(*e);
}
diff --git a/io_uring/register.c b/io_uring/register.c
index 05362fe79804..24e593332d1a 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -192,9 +192,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
return ret;
}
if (ctx->restrictions.op_registered)
- ctx->op_restricted = 1;
+ ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
if (ctx->restrictions.reg_registered)
- ctx->reg_restricted = 1;
+ ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
return 0;
}
@@ -392,7 +392,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
+ ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
@@ -733,7 +733,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (ctx->submitter_task && ctx->submitter_task != current)
return -EEXIST;
- if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+ if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
@@ -908,7 +908,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
- ret = io_register_zcrx_ifq(ctx, arg);
+ ret = io_register_zcrx(ctx, arg);
break;
case IORING_REGISTER_RESIZE_RINGS:
ret = -EINVAL;
@@ -946,40 +946,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return ret;
}
-/*
- * Given an 'fd' value, return the ctx associated with if. If 'registered' is
- * true, then the registered index is used. Otherwise, the normal fd table.
- * Caller must call fput() on the returned file, unless it's an ERR_PTR.
- */
-struct file *io_uring_register_get_file(unsigned int fd, bool registered)
-{
- struct file *file;
-
- if (registered) {
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- struct io_uring_task *tctx = current->io_uring;
-
- if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
- return ERR_PTR(-EINVAL);
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- file = tctx->registered_rings[fd];
- if (file)
- get_file(file);
- } else {
- file = fget(fd);
- }
-
- if (unlikely(!file))
- return ERR_PTR(-EBADF);
- if (io_is_uring_fops(file))
- return file;
- fput(file);
- return ERR_PTR(-EOPNOTSUPP);
-}
-
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
struct io_uring_sqe sqe;
@@ -1034,7 +1000,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (fd == -1)
return io_uring_register_blind(opcode, arg, nr_args);
- file = io_uring_register_get_file(fd, use_registered_ring);
+ file = io_uring_ctx_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
@@ -1046,6 +1012,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ctx->buf_table.nr, ret);
mutex_unlock(&ctx->uring_lock);
- fput(file);
+ if (!use_registered_ring)
+ fput(file);
return ret;
}
diff --git a/io_uring/register.h b/io_uring/register.h
index a5f39d5ef9e0..c9da997d503c 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -4,6 +4,5 @@
int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
-struct file *io_uring_register_get_file(unsigned int fd, bool registered);
#endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1b96ab5e98c9..fd36e0e319a2 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
u64 tag = 0;
uvec = u64_to_user_ptr(user_data);
- iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
+ iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
if (IS_ERR(iov)) {
err = PTR_ERR(iov);
break;
@@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
io_reset_rsrc_node(ctx, &ctx->buf_table, i);
ctx->buf_table.nodes[i] = node;
- if (ctx->compat)
+ if (io_is_compat(ctx))
user_data += sizeof(struct compat_iovec);
else
user_data += sizeof(struct iovec);
@@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (arg) {
uvec = (struct iovec __user *) arg;
- iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
+ iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
if (IS_ERR(iov)) {
ret = PTR_ERR(iov);
break;
}
- if (ctx->compat)
+ if (io_is_compat(ctx))
arg += sizeof(struct compat_iovec);
else
arg += sizeof(struct iovec);
@@ -961,7 +961,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
*/
imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
if (!imu) {
- kfree(node);
+ io_cache_free(&ctx->node_cache, node);
ret = -ENOMEM;
goto unlock;
}
@@ -1273,7 +1273,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
- file = io_uring_register_get_file(buf.src_fd, registered_src);
+ file = io_uring_ctx_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -1295,7 +1295,8 @@ out:
if (src_ctx != ctx)
mutex_unlock(&src_ctx->uring_lock);
- fput(file);
+ if (!registered_src)
+ fput(file);
return ret;
}
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a5f262734e8..20654deff84d 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
- !(ctx->flags & IORING_SETUP_IOPOLL)))
+ !(req->flags & REQ_F_IOPOLL)))
return false;
/*
* If ref is dying, we might be running poll reap from the exit work.
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
}
}
- if (req->ctx->flags & IORING_SETUP_IOPOLL)
+ if (req->flags & REQ_F_IOPOLL)
io_complete_rw_iopoll(&rw->kiocb, ret);
else
io_complete_rw(&rw->kiocb, ret);
@@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
- if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
+ if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) {
u32 cflags = 0;
__io_complete_rw_common(req, ret);
@@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ req->flags |= REQ_F_IOPOLL;
kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
req->iopoll_completed = 0;
@@ -899,7 +900,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
* We have a union of meta fields with wpq used for buffered-io
* in io_async_rw, so fail it here.
*/
- if (!(req->file->f_flags & O_DIRECT))
+ if (!(file->f_flags & O_DIRECT))
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HAS_METADATA;
kiocb->private = &io->meta;
@@ -961,13 +962,13 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
if (ret == -EAGAIN) {
/* If we can poll, just do that. */
if (io_file_can_poll(req))
- return -EAGAIN;
+ return ret;
/* IOPOLL retry should happen for io-wq threads */
- if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
- goto done;
+ if (!force_nonblock && !(req->flags & REQ_F_IOPOLL))
+ return ret;
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
- goto done;
+ return ret;
ret = 0;
} else if (ret == -EIOCBQUEUED) {
return IOU_ISSUE_SKIP_COMPLETE;
@@ -975,7 +976,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
(issue_flags & IO_URING_F_MULTISHOT)) {
/* read all, failed, already did sync or don't want to retry */
- goto done;
+ return ret;
}
/*
@@ -1018,8 +1019,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
kiocb->ki_flags &= ~IOCB_WAITQ;
iov_iter_restore(&io->iter, &io->iter_state);
} while (ret > 0);
-done:
- /* it's faster to check here than delegate to kfree */
+
return ret;
}
@@ -1188,7 +1188,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL))
goto ret_eagain;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index c6bb938ec5ea..46c12afec73e 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -458,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
return -EINVAL;
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_uring_task *tctx;
struct task_struct *tsk;
struct io_sq_data *sqd;
bool attached;
@@ -524,8 +525,13 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
rcu_assign_pointer(sqd->thread, tsk);
mutex_unlock(&sqd->lock);
+ ret = 0;
get_task_struct(tsk);
- ret = io_uring_alloc_task_context(tsk, ctx);
+ tctx = io_uring_alloc_task_context(tsk, ctx);
+ if (!IS_ERR(tctx))
+ tsk->io_uring = tctx;
+ else
+ ret = PTR_ERR(tctx);
wake_up_new_task(tsk);
if (ret)
goto err;
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7cbcb82aedfb..61533f30494f 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -74,20 +74,20 @@ void __io_uring_free(struct task_struct *tsk)
}
}
-__cold int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
tctx = kzalloc_obj(*tctx);
if (unlikely(!tctx))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
kfree(tctx);
- return ret;
+ return ERR_PTR(ret);
}
tctx->io_wq = io_init_wq_offload(ctx, task);
@@ -95,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
- return ret;
+ return ERR_PTR(ret);
}
tctx->task = task;
@@ -103,31 +103,56 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_cancel, 0);
atomic_set(&tctx->inflight_tracked, 0);
- task->io_uring = tctx;
init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
+ return tctx;
+}
+
+/*
+ * Ensure a node linking @tctx to @ctx exists: allocate one, store it in
+ * the tctx xarray keyed by the ctx pointer, and add it to the ctx's
+ * tctx_list under tctx_lock. Idempotent — returns 0 immediately if the
+ * node is already present. Returns -ENOMEM or an xarray store error.
+ */
+static int io_tctx_install_node(struct io_ring_ctx *ctx,
+				struct io_uring_task *tctx)
+{
+	struct io_tctx_node *node;
+	int ret;
+
+	if (xa_load(&tctx->xa, (unsigned long)ctx))
+		return 0;
+
+	node = kmalloc_obj(*node);
+	if (!node)
+		return -ENOMEM;
+	node->ctx = ctx;
+	node->task = current;
+
+	ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
+					node, GFP_KERNEL));
+	if (ret) {
+		kfree(node);
+		return ret;
+	}
+
+	mutex_lock(&ctx->tctx_lock);
+	list_add(&node->ctx_node, &ctx->tctx_list);
+	mutex_unlock(&ctx->tctx_lock);
	return 0;
}
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
int ret;
if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current, ctx);
- if (unlikely(ret))
- return ret;
+ tctx = io_uring_alloc_task_context(current, ctx);
+ if (IS_ERR(tctx))
+ return PTR_ERR(tctx);
- tctx = current->io_uring;
- if (ctx->iowq_limits_set) {
+ if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2] = { ctx->iowq_limits[0],
ctx->iowq_limits[1], };
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
- return ret;
+ goto err_free;
}
}
@@ -138,25 +163,19 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
*/
if (tctx->io_wq)
io_wq_set_exit_on_idle(tctx->io_wq, false);
- if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
- node = kmalloc_obj(*node);
- if (!node)
- return -ENOMEM;
- node->ctx = ctx;
- node->task = current;
-
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
- node, GFP_KERNEL));
- if (ret) {
- kfree(node);
- return ret;
- }
- mutex_lock(&ctx->tctx_lock);
- list_add(&node->ctx_node, &ctx->tctx_list);
- mutex_unlock(&ctx->tctx_lock);
+ ret = io_tctx_install_node(ctx, tctx);
+ if (!ret) {
+ current->io_uring = tctx;
+ return 0;
}
- return 0;
+ if (!current->io_uring) {
+err_free:
+ io_wq_put_and_exit(tctx->io_wq);
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ }
+ return ret;
}
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index 608e96de70a2..2310d2a0c46d 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -6,8 +6,8 @@ struct io_tctx_node {
struct io_ring_ctx *ctx;
};
-int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx);
+struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx);
void io_uring_del_tctx_node(unsigned long index);
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx);
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index cb61d4862fc6..4cfdfc519770 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -30,11 +30,30 @@ struct io_timeout_rem {
u64 addr;
/* timeout update */
- struct timespec64 ts;
+ ktime_t time;
u32 flags;
bool ltimeout;
};
+/*
+ * Decode a user-supplied timeout argument into a ktime_t. With
+ * IORING_TIMEOUT_IMMEDIATE_ARG, @arg is the nanosecond value itself;
+ * otherwise it is a user pointer to a struct timespec64. Negative
+ * times are rejected with -EINVAL, a bad user pointer with -EFAULT.
+ */
+static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
+{
+	struct timespec64 ts;
+
+	if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) {
+		*time = ns_to_ktime(arg);
+		if (*time < 0)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (get_timespec64(&ts, u64_to_user_ptr(arg)))
+		return -EFAULT;
+	if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		return -EINVAL;
+	*time = timespec64_to_ktime(ts);
+	return 0;
+}
+
static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link);
@@ -80,7 +99,7 @@ static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw)
/* re-arm timer */
raw_spin_lock_irq(&ctx->timeout_lock);
list_add(&timeout->list, ctx->timeout_list.prev);
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
+ hrtimer_start(&data->timer, data->time, data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return;
}
@@ -265,8 +284,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
raw_spin_lock_irqsave(&ctx->timeout_lock, flags);
list_del_init(&timeout->list);
- atomic_set(&req->ctx->cq_timeouts,
- atomic_read(&req->ctx->cq_timeouts) + 1);
+ atomic_set(&ctx->cq_timeouts,
+ atomic_read(&ctx->cq_timeouts) + 1);
raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags);
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
@@ -395,7 +414,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
}
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
+ ktime_t ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
@@ -417,12 +436,12 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
if (hrtimer_try_to_cancel(&io->timer) == -1)
return -EALREADY;
hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode);
- hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
+ hrtimer_start(&io->timer, ts, mode);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
+ ktime_t time, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
@@ -435,20 +454,23 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
timeout->off = 0; /* noseq */
data = req->async_data;
- data->ts = *ts;
+ data->time = time;
list_add_tail(&timeout->list, &ctx->timeout_list);
hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode);
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
+ hrtimer_start(&data->timer, data->time, mode);
return 0;
}
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
+ int ret;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
+ if (sqe->addr3 || sqe->__pad2[0])
+ return -EINVAL;
if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
@@ -460,12 +482,13 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
tr->ltimeout = true;
- if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
- return -EINVAL;
- if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2))))
- return -EFAULT;
- if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK |
+ IORING_TIMEOUT_ABS |
+ IORING_TIMEOUT_IMMEDIATE_ARG))
return -EINVAL;
+ ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags);
+ if (ret)
+ return ret;
} else if (tr->flags) {
/* timeout removal doesn't support flags */
return -EINVAL;
@@ -500,9 +523,9 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
raw_spin_lock_irq(&ctx->timeout_lock);
if (tr->ltimeout)
- ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode);
else
- ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ ret = io_timeout_update(ctx, tr->addr, tr->time, mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
}
@@ -520,7 +543,10 @@ static int __io_timeout_prep(struct io_kiocb *req,
struct io_timeout_data *data;
unsigned flags;
u32 off = READ_ONCE(sqe->off);
+ int ret;
+ if (sqe->addr3 || sqe->__pad2[0])
+ return -EINVAL;
if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
@@ -528,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
IORING_TIMEOUT_ETIME_SUCCESS |
- IORING_TIMEOUT_MULTISHOT))
+ IORING_TIMEOUT_MULTISHOT |
+ IORING_TIMEOUT_IMMEDIATE_ARG))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -539,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
INIT_LIST_HEAD(&timeout->list);
timeout->off = off;
- if (unlikely(off && !req->ctx->off_timeout_used))
- req->ctx->off_timeout_used = true;
+ if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)))
+ req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED;
/*
* for multishot reqs w/ fixed nr of repeats, repeats tracks the
* remaining nr
@@ -557,11 +584,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
data->req = req;
data->flags = flags;
- if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr))))
- return -EFAULT;
-
- if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
- return -EINVAL;
+ ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags);
+ if (ret)
+ return ret;
data->mode = io_translate_timeout_mode(flags);
@@ -637,7 +662,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
}
add:
list_add(&timeout->list, entry);
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
+ hrtimer_start(&data->timer, data->time, data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return IOU_ISSUE_SKIP_COMPLETE;
}
@@ -655,8 +680,7 @@ void io_queue_linked_timeout(struct io_kiocb *req)
if (timeout->head) {
struct io_timeout_data *data = req->async_data;
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
- data->mode);
+ hrtimer_start(&data->timer, data->time, data->mode);
list_add_tail(&timeout->list, &ctx->ltimeout_list);
}
raw_spin_unlock_irq(&ctx->timeout_lock);
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index 2b7c9ad72992..1620f94dd45a 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -3,7 +3,7 @@
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
- struct timespec64 ts;
+ ktime_t time;
enum hrtimer_mode mode;
u32 flags;
};
diff --git a/io_uring/tw.c b/io_uring/tw.c
index 2f2b4ac4b126..fdff81eebc95 100644
--- a/io_uring/tw.c
+++ b/io_uring/tw.c
@@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (!head) {
io_ctx_mark_taskrun(ctx);
- if (ctx->has_evfd)
+ if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD)
io_eventfd_signal(ctx, false);
}
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7b49f47cb5..7b25dcd9d05f 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
* because iopoll completion data overlaps with the hash_node used
* for tracking.
*/
- if (ctx->flags & IORING_SETUP_IOPOLL)
+ if (req->flags & REQ_F_IOPOLL)
return;
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
@@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
io_req_set_cqe32_extra(req, res2, 0);
}
io_req_uring_cleanup(req, issue_flags);
- if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+ if (req->flags & REQ_F_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1);
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
@@ -257,9 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
issue_flags |= IO_URING_F_CQE32;
if (io_is_compat(ctx))
issue_flags |= IO_URING_F_COMPAT;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!file->f_op->uring_cmd_iopoll)
- return -EOPNOTSUPP;
+ if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) {
+ req->flags |= REQ_F_IOPOLL;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
diff --git a/io_uring/wait.h b/io_uring/wait.h
index 3a145fcfd3dd..a4274b137f81 100644
--- a/io_uring/wait.h
+++ b/io_uring/wait.h
@@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
struct ext_arg *ext_arg);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx);
+void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 62d693287457..bd970fb084c1 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -63,7 +63,7 @@ static int io_area_max_shift(struct io_zcrx_mem *mem)
unsigned i;
for_each_sgtable_dma_sg(sgt, sg, i)
- shift = min(shift, __ffs(sg->length));
+ shift = min(shift, __ffs(sg_dma_len(sg)));
return shift;
}
@@ -127,10 +127,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
int dmabuf_fd = area_reg->dmabuf_fd;
int i, ret;
+ if (!ifq->dev)
+ return -EINVAL;
if (off)
return -EINVAL;
- if (WARN_ON_ONCE(!ifq->dev))
- return -EFAULT;
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
@@ -194,6 +194,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
{
struct page **pages;
int nr_pages, ret;
+ bool mapped = false;
if (area_reg->dmabuf_fd)
return -EINVAL;
@@ -207,22 +208,37 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
0, (unsigned long)nr_pages << PAGE_SHIFT,
GFP_KERNEL_ACCOUNT);
- if (ret) {
- unpin_user_pages(pages, nr_pages);
- kvfree(pages);
- return ret;
+ if (ret)
+ goto out_err;
+
+ if (ifq->dev) {
+ ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ if (ret < 0)
+ goto out_err;
+ mapped = true;
}
mem->account_pages = io_count_account_pages(pages, nr_pages);
ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
- if (ret < 0)
+ if (ret < 0) {
mem->account_pages = 0;
+ goto out_err;
+ }
mem->sgt = &mem->page_sg_table;
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
return ret;
+out_err:
+ if (mapped)
+ dma_unmap_sgtable(ifq->dev, &mem->page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ sg_free_table(&mem->page_sg_table);
+ unpin_user_pages(pages, nr_pages);
+ kvfree(pages);
+ return ret;
}
static void io_release_area_mem(struct io_zcrx_mem *mem)
@@ -273,8 +289,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
return;
area->is_mapped = false;
- for (i = 0; i < area->nia.num_niovs; i++)
- net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+ if (area->nia.niovs) {
+ for (i = 0; i < area->nia.num_niovs; i++)
+ net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+ }
if (area->mem.is_dmabuf) {
io_release_dmabuf(&area->mem);
@@ -284,45 +302,23 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
}
}
-static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
- int ret;
-
- guard(mutex)(&ifq->pp_lock);
- if (area->is_mapped)
- return 0;
-
- if (!area->mem.is_dmabuf) {
- ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- if (ret < 0)
- return ret;
- }
-
- ret = io_populate_area_dma(ifq, area);
- if (ret && !area->mem.is_dmabuf)
- dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- if (ret == 0)
- area->is_mapped = true;
- return ret;
-}
-
-static void io_zcrx_sync_for_device(struct page_pool *pool,
- struct net_iov *niov)
+static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx,
+ netmem_ref *netmems, unsigned nr)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+ struct device *dev = pp->p.dev;
+ unsigned i, niov_size;
dma_addr_t dma_addr;
- unsigned niov_size;
-
- if (!dma_dev_need_sync(pool->p.dev))
+ if (!dma_dev_need_sync(dev))
return;
+ niov_size = 1U << zcrx->niov_shift;
- niov_size = 1U << io_pp_to_ifq(pool)->niov_shift;
- dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
- __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
- niov_size, pool->p.dma_dir);
+ for (i = 0; i < nr; i++) {
+ dma_addr = page_pool_get_dma_addr_netmem(netmems[i]);
+ __dma_sync_single_for_device(dev, dma_addr + pp->p.offset,
+ niov_size, pp->p.dma_dir);
+ }
#endif
}
@@ -390,24 +386,24 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
return -EINVAL;
mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
- mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+ mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT;
- ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
+ ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset);
if (ret < 0)
return ret;
- ptr = io_region_get_ptr(&ifq->region);
- ifq->rq_ring = (struct io_uring *)ptr;
- ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
+ ptr = io_region_get_ptr(&ifq->rq_region);
+ ifq->rq.ring = (struct io_uring *)ptr;
+ ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
}
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->user, &ifq->region);
- ifq->rq_ring = NULL;
- ifq->rqes = NULL;
+ io_free_region(ifq->user, &ifq->rq_region);
+ ifq->rq.ring = NULL;
+ ifq->rq.rqes = NULL;
}
static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -429,8 +425,13 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area *area)
{
- if (ifq->area)
+ bool kern_readable = !area->mem.is_dmabuf;
+
+ if (WARN_ON_ONCE(ifq->area))
+ return -EINVAL;
+ if (WARN_ON_ONCE(ifq->kern_readable != kern_readable))
return -EINVAL;
+
ifq->area = area;
return 0;
}
@@ -450,6 +451,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
return -EINVAL;
buf_size_shift = ilog2(reg->rx_buf_len);
}
+ if (!ifq->dev && buf_size_shift != PAGE_SHIFT)
+ return -EOPNOTSUPP;
ret = -ENOMEM;
area = kzalloc_obj(*area);
@@ -460,8 +463,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
ret = io_import_area(ifq, &area->mem, area_reg);
if (ret)
goto err;
+ if (ifq->dev)
+ area->is_mapped = true;
- if (buf_size_shift > io_area_max_shift(&area->mem)) {
+ if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) {
ret = -ERANGE;
goto err;
}
@@ -495,6 +500,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
niov->type = NET_IOV_IOURING;
}
+ if (ifq->dev) {
+ ret = io_populate_area_dma(ifq, area);
+ if (ret)
+ goto err;
+ }
+
area->free_count = nr_iovs;
/* we're only supporting one area per ifq for now */
area->area_id = 0;
@@ -519,7 +530,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
return NULL;
ifq->if_rxq = -1;
- spin_lock_init(&ifq->rq_lock);
+ spin_lock_init(&ifq->rq.lock);
mutex_init(&ifq->pp_lock);
refcount_set(&ifq->refs, 1);
refcount_set(&ifq->user_refs, 1);
@@ -586,9 +597,21 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
- spin_lock_bh(&area->freelist_lock);
+ guard(spinlock_bh)(&area->freelist_lock);
area->freelist[area->free_count++] = net_iov_idx(niov);
- spin_unlock_bh(&area->freelist_lock);
+}
+
+static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area)
+{
+ unsigned niov_idx;
+
+ lockdep_assert_held(&area->freelist_lock);
+
+ if (unlikely(!area->free_count))
+ return NULL;
+
+ niov_idx = area->freelist[--area->free_count];
+ return &area->nia.niovs[niov_idx];
}
static void io_zcrx_return_niov(struct net_iov *niov)
@@ -624,12 +647,17 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
}
}
-static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+static void zcrx_unregister_user(struct io_zcrx_ifq *ifq)
{
if (refcount_dec_and_test(&ifq->user_refs)) {
io_close_queue(ifq);
io_zcrx_scrub(ifq);
}
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+ zcrx_unregister_user(ifq);
io_put_zcrx_ifq(ifq);
}
@@ -640,7 +668,7 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
lockdep_assert_held(&ctx->mmap_lock);
- return ifq ? &ifq->region : NULL;
+ return ifq ? &ifq->rq_region : NULL;
}
static int zcrx_box_release(struct inode *inode, struct file *file)
@@ -751,10 +779,50 @@ err:
return ret;
}
-int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
- struct io_uring_zcrx_ifq_reg __user *arg)
+static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
+ struct io_uring_zcrx_ifq_reg *reg,
+ struct io_uring_zcrx_area_reg *area)
{
struct pp_memory_provider_params mp_param = {};
+ unsigned if_rxq = reg->if_rxq;
+ int ret;
+
+ ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns,
+ reg->if_idx);
+ if (!ifq->netdev)
+ return -ENODEV;
+
+ netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
+
+ ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq);
+ if (!ifq->dev) {
+ ret = -EOPNOTSUPP;
+ goto netdev_put_unlock;
+ }
+ get_device(ifq->dev);
+
+ ret = io_zcrx_create_area(ifq, area, reg);
+ if (ret)
+ goto netdev_put_unlock;
+
+ if (reg->rx_buf_len)
+ mp_param.rx_page_size = 1U << ifq->niov_shift;
+ mp_param.mp_ops = &io_uring_pp_zc_ops;
+ mp_param.mp_priv = ifq;
+ ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL);
+ if (ret)
+ goto netdev_put_unlock;
+
+ ifq->if_rxq = if_rxq;
+ ret = 0;
+netdev_put_unlock:
+ netdev_unlock(ifq->netdev);
+ return ret;
+}
+
+int io_register_zcrx(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg)
+{
struct io_uring_zcrx_area_reg area;
struct io_uring_zcrx_ifq_reg reg;
struct io_uring_region_desc rd;
@@ -778,11 +846,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EFAULT;
if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
return -EINVAL;
+ if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS)
+ return -EINVAL;
if (reg.flags & ZCRX_REG_IMPORT)
return import_zcrx(ctx, arg, &reg);
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
return -EFAULT;
- if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
+ if (reg.if_rxq == -1 || !reg.rq_entries)
+ return -EINVAL;
+ if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV))
return -EINVAL;
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
if (!(ctx->flags & IORING_SETUP_CLAMP))
@@ -806,7 +878,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
mmgrab(ctx->mm_account);
ifq->mm_account = ctx->mm_account;
}
- ifq->rq_entries = reg.rq_entries;
+ ifq->rq.nr_entries = reg.rq_entries;
scoped_guard(mutex, &ctx->mmap_lock) {
/* preallocate id */
@@ -819,33 +891,17 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (ret)
goto err;
- ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
- if (!ifq->netdev) {
- ret = -ENODEV;
- goto err;
- }
- netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
+ ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
- ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
- if (!ifq->dev) {
- ret = -EOPNOTSUPP;
- goto netdev_put_unlock;
+ if (!(reg.flags & ZCRX_REG_NODEV)) {
+ ret = zcrx_register_netdev(ifq, &reg, &area);
+ if (ret)
+ goto err;
+ } else {
+ ret = io_zcrx_create_area(ifq, &area, &reg);
+ if (ret)
+ goto err;
}
- get_device(ifq->dev);
-
- ret = io_zcrx_create_area(ifq, &area, &reg);
- if (ret)
- goto netdev_put_unlock;
-
- if (reg.rx_buf_len)
- mp_param.rx_page_size = 1U << ifq->niov_shift;
- mp_param.mp_ops = &io_uring_pp_zc_ops;
- mp_param.mp_priv = ifq;
- ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
- if (ret)
- goto netdev_put_unlock;
- netdev_unlock(ifq->netdev);
- ifq->if_rxq = reg.if_rxq;
reg.zcrx_id = id;
@@ -865,8 +921,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
}
return 0;
-netdev_put_unlock:
- netdev_unlock(ifq->netdev);
err:
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->zcrx_ctxs, id);
@@ -875,17 +929,37 @@ ifq_free:
return ret;
}
-static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
+static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id)
{
- unsigned niov_idx;
+ return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
+}
- lockdep_assert_held(&area->freelist_lock);
+static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id)
+{
+ xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
+}
- niov_idx = area->freelist[--area->free_count];
- return &area->nia.niovs[niov_idx];
+void io_terminate_zcrx(struct io_ring_ctx *ctx)
+{
+ struct io_zcrx_ifq *ifq;
+ unsigned long id = 0;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ while (1) {
+ scoped_guard(mutex, &ctx->mmap_lock)
+ ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+ if (!ifq)
+ break;
+ if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id)))
+ break;
+ set_zcrx_entry_mark(ctx, id);
+ id++;
+ zcrx_unregister_user(ifq);
+ }
}
-void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+void io_unregister_zcrx(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
@@ -896,31 +970,35 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
unsigned long id = 0;
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
- if (ifq)
+ if (ifq) {
+ if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) {
+ ifq = NULL;
+ break;
+ }
xa_erase(&ctx->zcrx_ctxs, id);
+ }
}
if (!ifq)
break;
- zcrx_unregister(ifq);
+ io_put_zcrx_ifq(ifq);
}
xa_destroy(&ctx->zcrx_ctxs);
}
-static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+static inline u32 zcrx_rq_entries(struct zcrx_rq *rq)
{
u32 entries;
- entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
- return min(entries, ifq->rq_entries);
+ entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head;
+ return min(entries, rq->nr_entries);
}
-static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
- unsigned mask)
+static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask)
{
- unsigned int idx = ifq->cached_rq_head++ & mask;
+ unsigned int idx = rq->cached_head++ & mask;
- return &ifq->rqes[idx];
+ return &rq->rqes[idx];
}
static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
@@ -946,21 +1024,24 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
return true;
}
-static void io_zcrx_ring_refill(struct page_pool *pp,
- struct io_zcrx_ifq *ifq)
+static unsigned io_zcrx_ring_refill(struct page_pool *pp,
+ struct io_zcrx_ifq *ifq,
+ netmem_ref *netmems, unsigned to_alloc)
{
- unsigned int mask = ifq->rq_entries - 1;
+ struct zcrx_rq *rq = &ifq->rq;
+ unsigned int mask = rq->nr_entries - 1;
unsigned int entries;
+ unsigned allocated = 0;
- guard(spinlock_bh)(&ifq->rq_lock);
+ guard(spinlock_bh)(&rq->lock);
- entries = io_zcrx_rqring_entries(ifq);
- entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
+ entries = zcrx_rq_entries(rq);
+ entries = min_t(unsigned, entries, to_alloc);
if (unlikely(!entries))
- return;
+ return 0;
do {
- struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+ struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
struct net_iov *niov;
netmem_ref netmem;
@@ -978,46 +1059,56 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
continue;
}
- io_zcrx_sync_for_device(pp, niov);
- net_mp_netmem_place_in_cache(pp, netmem);
+ netmems[allocated] = netmem;
+ allocated++;
} while (--entries);
- smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+ smp_store_release(&rq->ring->head, rq->cached_head);
+ return allocated;
}
-static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq,
+ netmem_ref *netmems, unsigned to_alloc)
{
struct io_zcrx_area *area = ifq->area;
+ unsigned allocated = 0;
- spin_lock_bh(&area->freelist_lock);
- while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
- struct net_iov *niov = __io_zcrx_get_free_niov(area);
- netmem_ref netmem = net_iov_to_netmem(niov);
+ guard(spinlock_bh)(&area->freelist_lock);
+ for (allocated = 0; allocated < to_alloc; allocated++) {
+ struct net_iov *niov = zcrx_get_free_niov(area);
+
+ if (!niov)
+ break;
net_mp_niov_set_page_pool(pp, niov);
- io_zcrx_sync_for_device(pp, niov);
- net_mp_netmem_place_in_cache(pp, netmem);
+ netmems[allocated] = net_iov_to_netmem(niov);
}
- spin_unlock_bh(&area->freelist_lock);
+ return allocated;
}
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+ netmem_ref *netmems = pp->alloc.cache;
+ unsigned to_alloc = PP_ALLOC_CACHE_REFILL;
+ unsigned allocated;
/* pp should already be ensuring that */
- if (unlikely(pp->alloc.count))
- goto out_return;
+ if (WARN_ON_ONCE(pp->alloc.count))
+ return 0;
- io_zcrx_ring_refill(pp, ifq);
- if (likely(pp->alloc.count))
+ allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc);
+ if (likely(allocated))
goto out_return;
- io_zcrx_refill_slow(pp, ifq);
- if (!pp->alloc.count)
+ allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
+ if (!allocated)
return 0;
out_return:
- return pp->alloc.cache[--pp->alloc.count];
+ zcrx_sync_for_device(pp, ifq, netmems, allocated);
+ allocated--;
+ pp->alloc.count += allocated;
+ return netmems[allocated];
}
static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
@@ -1036,7 +1127,6 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
static int io_pp_zc_init(struct page_pool *pp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
- int ret;
if (WARN_ON_ONCE(!ifq))
return -EINVAL;
@@ -1049,10 +1139,6 @@ static int io_pp_zc_init(struct page_pool *pp)
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
- ret = io_zcrx_map_area(ifq, ifq->area);
- if (ret)
- return ret;
-
refcount_inc(&ifq->refs);
return 0;
}
@@ -1100,14 +1186,14 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
};
static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
- struct io_zcrx_ifq *zcrx)
+ struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq)
{
- unsigned int mask = zcrx->rq_entries - 1;
+ unsigned int mask = rq->nr_entries - 1;
unsigned int i;
- nr = min(nr, io_zcrx_rqring_entries(zcrx));
+ nr = min(nr, zcrx_rq_entries(rq));
for (i = 0; i < nr; i++) {
- struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+ struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
struct net_iov *niov;
if (!io_parse_rqe(rqe, zcrx, &niov))
@@ -1115,7 +1201,7 @@ static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
netmem_array[i] = net_iov_to_netmem(niov);
}
- smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+ smp_store_release(&rq->ring->head, rq->cached_head);
return i;
}
@@ -1149,8 +1235,10 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
return -EINVAL;
do {
- scoped_guard(spinlock_bh, &zcrx->rq_lock) {
- nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+ struct zcrx_rq *rq = &zcrx->rq;
+
+ scoped_guard(spinlock_bh, &rq->lock) {
+ nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq);
zcrx_return_buffers(netmems, nr);
}
@@ -1159,7 +1247,7 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
if (fatal_signal_pending(current))
break;
cond_resched();
- } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+ } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries);
return 0;
}
@@ -1169,6 +1257,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
struct zcrx_ctrl ctrl;
struct io_zcrx_ifq *zcrx;
+ BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
+
if (nr_args)
return -EINVAL;
if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
@@ -1221,13 +1311,11 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
struct io_zcrx_area *area = ifq->area;
struct net_iov *niov = NULL;
- if (area->mem.is_dmabuf)
+ if (!ifq->kern_readable)
return NULL;
- spin_lock_bh(&area->freelist_lock);
- if (area->free_count)
- niov = __io_zcrx_get_free_niov(area);
- spin_unlock_bh(&area->freelist_lock);
+ scoped_guard(spinlock_bh, &area->freelist_lock)
+ niov = zcrx_get_free_niov(area);
if (niov)
page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 32ab95b2cb81..75e0a4e6ef6e 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -8,6 +8,9 @@
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
+#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
+#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE)
+
struct io_zcrx_mem {
unsigned long size;
bool is_dmabuf;
@@ -38,17 +41,22 @@ struct io_zcrx_area {
struct io_zcrx_mem mem;
};
+struct zcrx_rq {
+ spinlock_t lock;
+ struct io_uring *ring;
+ struct io_uring_zcrx_rqe *rqes;
+ u32 cached_head;
+ u32 nr_entries;
+};
+
struct io_zcrx_ifq {
struct io_zcrx_area *area;
unsigned niov_shift;
struct user_struct *user;
struct mm_struct *mm_account;
+ bool kern_readable;
- spinlock_t rq_lock ____cacheline_aligned_in_smp;
- struct io_uring *rq_ring;
- struct io_uring_zcrx_rqe *rqes;
- u32 cached_rq_head;
- u32 rq_entries;
+ struct zcrx_rq rq ____cacheline_aligned_in_smp;
u32 if_rxq;
struct device *dev;
@@ -63,26 +71,30 @@ struct io_zcrx_ifq {
* net stack.
*/
struct mutex pp_lock;
- struct io_mapped_region region;
+ struct io_mapped_region rq_region;
};
#if defined(CONFIG_IO_URING_ZCRX)
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
-int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
-void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
+void io_unregister_zcrx(struct io_ring_ctx *ctx);
+void io_terminate_zcrx(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id);
#else
-static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
- struct io_uring_zcrx_ifq_reg __user *arg)
+static inline int io_register_zcrx(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg)
{
return -EOPNOTSUPP;
}
-static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+static inline void io_unregister_zcrx(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_terminate_zcrx(struct io_ring_ctx *ctx)
{
}
static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,