diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-13 16:22:30 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-13 16:22:30 -0700 |
| commit | 23acda7c221a76ff711d65f4ca90029d43b249a0 (patch) | |
| tree | 3e7745c9210489864e153990c06833d7d47a3dcd /io_uring | |
| parent | 7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (diff) | |
| parent | c5e9f6a96bf7379da87df1b852b90527e242b56f (diff) | |
Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Add a callback driven main loop for io_uring, and BPF struct_ops
on top to allow implementing custom event loop logic
- Decouple IOPOLL from being a ring-wide all-or-nothing setting,
  allowing IOPOLL use cases to also issue certain whitelisted
  non-polled opcodes
- Timeout improvements. Migrate internal timeout storage from
timespec64 to ktime_t for simpler arithmetic and avoid copying of
timespec data
- Zero-copy receive (zcrx) updates:
- Add a device-less mode (ZCRX_REG_NODEV) for testing and
experimentation where data flows through the copy fallback path
- Fix two-step unregistration regression, DMA length calculations,
xarray mark usage, and a potential 32-bit overflow in id
shifting
- Refactoring toward multi-area support: dedicated refill queue
struct, consolidated DMA syncing, netmem array refilling format,
and guard-based locking
- Zero-copy transmit (zctx) cleanup:
- Unify io_send_zc() and io_sendmsg_zc() into a single function
- Add vectorized registered buffer send for IORING_OP_SEND_ZC
- Add separate notification user_data via sqe->addr3 so
notification and completion CQEs can be distinguished without
extra reference counting
- Switch struct io_ring_ctx internal bitfields to explicit flag bits
with atomic-safe accessors, and annotate the known harmless races on
those flags
- Various optimizations caching ctx and other request fields in local
variables to avoid repeated loads, and cleanups for tctx setup, ring
fd registration, and read path early returns
* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
io_uring: unify getting ctx from passed in file descriptor
io_uring/register: don't get a reference to the registered ring fd
io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
io_uring/tctx: have io_uring_alloc_task_context() return tctx
io_uring/timeout: use 'ctx' consistently
io_uring/rw: clean up __io_read() obsolete comment and early returns
io_uring/zcrx: use correct mmap off constants
io_uring/zcrx: use dma_len for chunk size calculation
io_uring/zcrx: don't clear not allocated niovs
io_uring/zcrx: don't use mark0 for allocating xarray
io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
io_uring/zcrx: reject REG_NODEV with large rx_buf_size
io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
io_uring/rsrc: use io_cache_free() to free node
io_uring/zcrx: rename zcrx [un]register functions
io_uring/zcrx: check ctrl op payload struct sizes
io_uring/zcrx: cache fallback availability in zcrx ctx
io_uring/zcrx: warn on a repeated area append
io_uring/zcrx: consolidate dma syncing
io_uring/zcrx: netmem array as refiling format
...
Diffstat (limited to 'io_uring')
| -rw-r--r-- | io_uring/Kconfig | 5 | ||||
| -rw-r--r-- | io_uring/Makefile | 3 | ||||
| -rw-r--r-- | io_uring/bpf-ops.c | 270 | ||||
| -rw-r--r-- | io_uring/bpf-ops.h | 28 | ||||
| -rw-r--r-- | io_uring/cancel.c | 9 | ||||
| -rw-r--r-- | io_uring/cmd_net.c | 34 | ||||
| -rw-r--r-- | io_uring/eventfd.c | 4 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 183 | ||||
| -rw-r--r-- | io_uring/io_uring.h | 11 | ||||
| -rw-r--r-- | io_uring/kbuf.c | 4 | ||||
| -rw-r--r-- | io_uring/loop.c | 91 | ||||
| -rw-r--r-- | io_uring/loop.h | 27 | ||||
| -rw-r--r-- | io_uring/msg_ring.c | 2 | ||||
| -rw-r--r-- | io_uring/net.c | 148 | ||||
| -rw-r--r-- | io_uring/net.h | 1 | ||||
| -rw-r--r-- | io_uring/opdef.c | 12 | ||||
| -rw-r--r-- | io_uring/opdef.h | 2 | ||||
| -rw-r--r-- | io_uring/poll.c | 8 | ||||
| -rw-r--r-- | io_uring/query.c | 4 | ||||
| -rw-r--r-- | io_uring/register.c | 49 | ||||
| -rw-r--r-- | io_uring/register.h | 1 | ||||
| -rw-r--r-- | io_uring/rsrc.c | 15 | ||||
| -rw-r--r-- | io_uring/rw.c | 24 | ||||
| -rw-r--r-- | io_uring/sqpoll.c | 8 | ||||
| -rw-r--r-- | io_uring/tctx.c | 79 | ||||
| -rw-r--r-- | io_uring/tctx.h | 4 | ||||
| -rw-r--r-- | io_uring/timeout.c | 78 | ||||
| -rw-r--r-- | io_uring/timeout.h | 2 | ||||
| -rw-r--r-- | io_uring/tw.c | 2 | ||||
| -rw-r--r-- | io_uring/uring_cmd.c | 9 | ||||
| -rw-r--r-- | io_uring/wait.h | 1 | ||||
| -rw-r--r-- | io_uring/zcrx.c | 384 | ||||
| -rw-r--r-- | io_uring/zcrx.h | 34 |
33 files changed, 1030 insertions, 506 deletions
diff --git a/io_uring/Kconfig b/io_uring/Kconfig index a7ae23cf1035..a283d9e53787 100644 --- a/io_uring/Kconfig +++ b/io_uring/Kconfig @@ -14,3 +14,8 @@ config IO_URING_BPF def_bool y depends on BPF depends on NET + +config IO_URING_BPF_OPS + def_bool y + depends on IO_URING + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF diff --git a/io_uring/Makefile b/io_uring/Makefile index 931f9156132a..c54e328d1410 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ advise.o openclose.o statx.o timeout.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o \ - query.o + query.o loop.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o @@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o obj-$(CONFIG_PROC_FS) += fdinfo.o obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o +obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c new file mode 100644 index 000000000000..937e48bef40b --- /dev/null +++ b/io_uring/bpf-ops.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/mutex.h> +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> + +#include "io_uring.h" +#include "register.h" +#include "loop.h" +#include "memmap.h" +#include "bpf-ops.h" + +static DEFINE_MUTEX(io_bpf_ctrl_mutex); +static const struct btf_type *loop_params_type; + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr) +{ + return io_submit_sqes(ctx, nr); +} + +__bpf_kfunc +__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id, + const size_t rdwr_buf_size) +{ + struct io_mapped_region *r; + + lockdep_assert_held(&ctx->uring_lock); + + switch (region_id) { + case IOU_REGION_MEM: + r = &ctx->param_region; + break; + case IOU_REGION_CQ: + r = &ctx->ring_region; + break; + case IOU_REGION_SQ: + r = &ctx->sq_region; + 
break; + default: + return NULL; + } + + if (unlikely(rdwr_buf_size > io_region_size(r))) + return NULL; + return io_region_get_ptr(r); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(io_uring_kfunc_set) +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE); +BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL); +BTF_KFUNCS_END(io_uring_kfunc_set) + +static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = { + .owner = THIS_MODULE, + .set = &io_uring_kfunc_set, +}; + +static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx, + struct iou_loop_params *lp) +{ + return IOU_LOOP_STOP; +} + +static struct io_uring_bpf_ops io_bpf_ops_stubs = { + .loop_step = io_bpf_ops__loop_step, +}; + +static bool bpf_io_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_io_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == loop_params_type) { + if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_verifier_ops bpf_io_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_io_is_valid_access, + .btf_struct_access = bpf_io_btf_struct_access, +}; + +static const struct btf_type * +io_lookup_struct_type(struct btf *btf, const char *name) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (type_id < 0) + return NULL; + return btf_type_by_id(btf, type_id); +} + +static int bpf_io_init(struct btf *btf) +{ + int ret; + + loop_params_type = io_lookup_struct_type(btf, 
"iou_loop_params"); + if (!loop_params_type) { + pr_err("io_uring: Failed to locate iou_loop_params\n"); + return -EINVAL; + } + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_io_uring_kfunc_set); + if (ret) { + pr_err("io_uring: Failed to register kfuncs (%d)\n", ret); + return ret; + } + return 0; +} + +static int bpf_io_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + return 0; +} + +static int bpf_io_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + const struct io_uring_bpf_ops *uops = udata; + struct io_uring_bpf_ops *ops = kdata; + + switch (moff) { + case offsetof(struct io_uring_bpf_ops, ring_fd): + ops->ring_fd = uops->ring_fd; + return 1; + } + return 0; +} + +static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) +{ + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) + return -EOPNOTSUPP; + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EOPNOTSUPP; + + if (ctx->bpf_ops) + return -EBUSY; + if (WARN_ON_ONCE(!ops->loop_step)) + return -EINVAL; + + ops->priv = ctx; + ctx->bpf_ops = ops; + ctx->loop_step = ops->loop_step; + return 0; +} + +static int bpf_io_reg(void *kdata, struct bpf_link *link) +{ + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + struct file *file; + int ret = -EBUSY; + + file = io_uring_ctx_get_file(ops->ring_fd, false); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + scoped_guard(mutex, &io_bpf_ctrl_mutex) { + guard(mutex)(&ctx->uring_lock); + ret = io_install_bpf(ctx, ops); + } + + fput(file); + return ret; +} + +static void io_eject_bpf(struct io_ring_ctx *ctx) +{ + struct io_uring_bpf_ops *ops = ctx->bpf_ops; + + if (WARN_ON_ONCE(!ops)) + return; + if (WARN_ON_ONCE(ops->priv != ctx)) + return; + + ops->priv = NULL; + ctx->bpf_ops = NULL; + 
ctx->loop_step = NULL; +} + +static void bpf_io_unreg(void *kdata, struct bpf_link *link) +{ + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + + guard(mutex)(&io_bpf_ctrl_mutex); + ctx = ops->priv; + if (ctx) { + guard(mutex)(&ctx->uring_lock); + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) + return; + + io_eject_bpf(ctx); + } +} + +void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ + /* + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, + * and read protected by either. Try to avoid taking the global lock + * for rings that never had any bpf installed. + */ + scoped_guard(mutex, &ctx->uring_lock) { + if (!ctx->bpf_ops) + return; + } + + guard(mutex)(&io_bpf_ctrl_mutex); + guard(mutex)(&ctx->uring_lock); + if (ctx->bpf_ops) + io_eject_bpf(ctx); +} + +static struct bpf_struct_ops bpf_ring_ops = { + .verifier_ops = &bpf_io_verifier_ops, + .reg = bpf_io_reg, + .unreg = bpf_io_unreg, + .check_member = bpf_io_check_member, + .init_member = bpf_io_init_member, + .init = bpf_io_init, + .cfi_stubs = &io_bpf_ops_stubs, + .name = "io_uring_bpf_ops", + .owner = THIS_MODULE, +}; + +static int __init io_uring_bpf_init(void) +{ + int ret; + + ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops); + if (ret) { + pr_err("io_uring: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + return 0; +} +__initcall(io_uring_bpf_init); diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h new file mode 100644 index 000000000000..b39b3fd3acda --- /dev/null +++ b/io_uring/bpf-ops.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_BPF_OPS_H +#define IOU_BPF_OPS_H + +#include <linux/io_uring_types.h> + +enum { + IOU_REGION_MEM, + IOU_REGION_CQ, + IOU_REGION_SQ, +}; + +struct io_uring_bpf_ops { + int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp); + + __u32 ring_fd; + void *priv; +}; + +#ifdef CONFIG_IO_URING_BPF_OPS +void io_unregister_bpf_ops(struct io_ring_ctx *ctx); +#else +static inline void 
io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ +} +#endif + +#endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 65e04063e343..5e5eb9cfc7cd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -156,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + u32 op; + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - cancel->opcode = READ_ONCE(sqe->len); + + op = READ_ONCE(sqe->len); + if (op >= IORING_OP_LAST) + return -EINVAL; + + cancel->opcode = op; } return 0; diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 125a81c520a6..7cd411fc4f33 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -7,6 +7,21 @@ #include "uring_cmd.h" #include "io_uring.h" +static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op) +{ + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + ret = prot->ioctl(sk, op, &arg); + if (ret) + return ret; + return arg; +} + static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -156,27 +171,12 @@ static int io_uring_cmd_getsockname(struct socket *sock, int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; switch (cmd->cmd_op) { case SOCKET_URING_OP_SIOCINQ: - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; + return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ); case SOCKET_URING_OP_SIOCOUTQ: - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; + return 
io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ); case SOCKET_URING_OP_GETSOCKOPT: return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 7482a7dc6b38..3da028500f76 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; + ctx->int_flags |= IO_RING_F_HAS_EVFD; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); @@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { - ctx->has_evfd = false; + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d7bcbb97406..dd6326dc5f88 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -87,6 +87,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "bpf-ops.h" #include "timeout.h" #include "poll.h" @@ -95,6 +96,7 @@ #include "eventfd.h" #include "wait.h" #include "bpf_filter.h" +#include "loop.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -356,7 +358,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; @@ -378,7 +379,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) + if 
(should_hash || (req->flags & REQ_F_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -477,17 +478,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->poll_activated) + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) io_poll_wq_wake(ctx); - if (ctx->off_timeout_used) + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) io_flush_timeouts(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_lock(&ctx->completion_lock); } @@ -500,11 +501,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - if (!ctx->task_complete) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ - if (!ctx->syscall_iopoll) + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); @@ -589,6 +590,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) +{ + __io_cqring_overflow_flush(ctx, false); +} + /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { @@ -830,7 +836,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { lockdep_assert_held(&ctx->uring_lock); - lockdep_assert(ctx->lockless_cq); + lockdep_assert(ctx->int_flags 
& IO_RING_F_LOCKLESS_CQ); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); @@ -860,7 +866,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); @@ -885,7 +891,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) lockdep_assert_held(&ctx->uring_lock); cqe[0].user_data = req->cqe.user_data; - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux32(ctx, cqe); spin_unlock(&ctx->completion_lock); @@ -913,7 +919,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. 
*/ - if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); @@ -1067,12 +1073,14 @@ void io_queue_next(struct io_kiocb *req) static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); + io_put_rsrc_node(ctx, req->file_node); req->file_node = NULL; } if (req->flags & REQ_F_BUF_NODE) - io_put_rsrc_node(req->ctx, req->buf_node); + io_put_rsrc_node(ctx, req->buf_node); } static void io_free_batch_list(struct io_ring_ctx *ctx, @@ -1135,7 +1143,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); else io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); @@ -1148,7 +1156,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) INIT_WQ_LIST(&state->compl_reqs); } - if (unlikely(ctx->drain_active)) + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_queue_deferred(ctx); ctx->submit_state.cq_flush = false; @@ -1187,7 +1195,6 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { - unsigned int nr_events = 0; unsigned long check_cq; min_events = min(min_events, ctx->cq_entries); @@ -1230,8 +1237,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) * very same mutex. 
*/ if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { - u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { @@ -1240,7 +1245,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list)) + if (list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min_events); @@ -1251,9 +1256,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) return -EINTR; if (need_resched()) break; - - nr_events += ret; - } while (nr_events < min_events); + } while (io_cqring_events(ctx) < min_events); return 0; } @@ -1344,7 +1347,7 @@ static __cold void io_drain_req(struct io_kiocb *req) list_add_tail(&de->list, &ctx->defer_list); io_queue_deferred(ctx); if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1418,8 +1421,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + if (req->flags & REQ_F_IOPOLL) io_iopoll_req_issued(req, issue_flags); } return ret; @@ -1435,7 +1437,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) io_tw_lock(req->ctx, tw); WARN_ON_ONCE(!req->file); - if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) return -EFAULT; ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); @@ -1533,7 +1535,7 @@ fail: * wait for request slots on the block side. 
*/ if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!(req->flags & REQ_F_IOPOLL)) break; if (io_wq_worker_stopped()) break; @@ -1655,7 +1657,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) } else { /* can't fail with IO_URING_F_INLINE */ io_req_sqe_copy(req, IO_URING_F_INLINE); - if (unlikely(req->ctx->drain_active)) + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_drain_req(req); else io_queue_iowq(req); @@ -1671,7 +1673,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->op_restricted) + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -1691,7 +1693,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) { struct io_kiocb *head = ctx->submit_state.link.head; - ctx->drain_active = true; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; if (head) { /* * If we need to drain a request in the middle of a link, drain @@ -1701,7 +1703,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) * link. 
*/ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; } } @@ -1767,23 +1769,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) return io_init_fail_req(req, -EOPNOTSUPP); io_init_drain(ctx); } } - if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -2148,12 +2150,13 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { + io_unregister_bpf_ops(ctx); io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); - io_unregister_zcrx_ifqs(ctx); + io_unregister_zcrx(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); @@ -2204,7 +2207,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) poll_wq_task_work); 
mutex_lock(&ctx->uring_lock); - ctx->poll_activated = true; + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; mutex_unlock(&ctx->uring_lock); /* @@ -2219,9 +2222,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ - if (ctx->poll_activated || ctx->poll_wq_task_work.func) + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) goto out; - if (WARN_ON_ONCE(!ctx->task_complete)) + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) goto out; if (!ctx->submitter_task) goto out; @@ -2242,7 +2245,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!ctx->poll_activated)) + if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper @@ -2308,6 +2311,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct io_tctx_node *node; int ret; + mutex_lock(&ctx->uring_lock); + io_terminate_zcrx(ctx); + mutex_unlock(&ctx->uring_lock); + /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while @@ -2539,39 +2546,54 @@ uaccess_end: #endif } -SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const void __user *, argp, - size_t, argsz) +/* + * Given an 'fd' value, return the ctx associated with if. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file if it isn't a registered file, + * unless it's an ERR_PTR. 
+ */ +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) { - struct io_ring_ctx *ctx; struct file *file; - long ret; - if (unlikely(flags & ~IORING_ENTER_FLAGS)) - return -EINVAL; - - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - if (flags & IORING_ENTER_REGISTERED_RING) { + if (registered) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; + return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; } else { file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(file))) - goto out; } + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) +{ + struct io_ring_ctx *ctx; + struct file *file; + long ret; + + if (unlikely(flags & ~IORING_ENTER_FLAGS)) + return -EINVAL; + + file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; ret = -EBADFD; /* @@ -2581,6 +2603,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; + if (io_has_loop_ops(ctx)) { + ret = io_run_loop(ctx); + goto out; + } + /* * For SQ polling, the thread will do all submissions and completions. 
* Just return the requested submit count, and wake the thread if @@ -2610,7 +2637,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - if (ctx->syscall_iopoll) + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and @@ -2625,7 +2652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { int ret2; - if (ctx->syscall_iopoll) { + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to @@ -2926,9 +2953,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, if (dst->bpf_filters) WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); if (dst->op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (dst->reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; } static __cold int io_uring_create(struct io_ctx_config *config) @@ -2955,17 +2982,18 @@ static __cold int io_uring_create(struct io_ctx_config *config) if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL)) - ctx->task_complete = true; + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) - ctx->lockless_cq = true; + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || + (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ - if (!ctx->task_complete) - ctx->poll_activated = true; + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user @@ -2975,9 +3003,10 @@ static __cold int io_uring_create(struct io_ctx_config *config) */ if 
(ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; - ctx->compat = in_compat_syscall(); + if (in_compat_syscall()) + ctx->int_flags |= IO_RING_F_COMPAT; if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ee24bc5d77b3..e612a66ee80e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -185,6 +185,7 @@ void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); @@ -223,7 +224,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); - } else if (!ctx->task_complete) { + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* @@ -240,7 +241,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline bool io_is_compat(struct io_ring_ctx *ctx) { - return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -494,10 +495,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \ + IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED) + static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || - ctx->has_evfd || 
ctx->poll_activated)) + if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK)) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 5257b3aad395..8da2ff798170 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -230,7 +230,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, struct io_br_sel sel = { }; struct io_buffer_list *bl; - io_ring_submit_lock(req->ctx, issue_flags); + io_ring_submit_lock(ctx, issue_flags); bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { @@ -239,7 +239,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, else sel.addr = io_provided_buffer_select(req, len, bl); } - io_ring_submit_unlock(req->ctx, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); return sel; } diff --git a/io_uring/loop.c b/io_uring/loop.c new file mode 100644 index 000000000000..31843cc3e451 --- /dev/null +++ b/io_uring/loop.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "io_uring.h" +#include "wait.h" +#include "loop.h" + +static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, + const struct iou_loop_params *lp) +{ + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); +} + +static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) +{ + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); +} + +static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) +{ + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); +} + +static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, + unsigned nr_wait) +{ + io_loop_wait_start(ctx, nr_wait); + + if (unlikely(io_local_work_pending(ctx) || + io_loop_nr_cqes(ctx, lp) <= 0) || + READ_ONCE(ctx->check_cq)) { + io_loop_wait_finish(ctx); + return; + } + + mutex_unlock(&ctx->uring_lock); + schedule(); + io_loop_wait_finish(ctx); + mutex_lock(&ctx->uring_lock); +} + +static int __io_run_loop(struct 
io_ring_ctx *ctx) +{ + struct iou_loop_params lp = {}; + + while (true) { + int nr_wait, step_res; + + if (unlikely(!ctx->loop_step)) + return -EFAULT; + + step_res = ctx->loop_step(ctx, &lp); + if (step_res == IOU_LOOP_STOP) + break; + if (step_res != IOU_LOOP_CONTINUE) + return -EINVAL; + + nr_wait = io_loop_nr_cqes(ctx, &lp); + if (nr_wait > 0) + io_loop_wait(ctx, &lp, nr_wait); + else + nr_wait = 0; + + if (task_work_pending(current)) { + mutex_unlock(&ctx->uring_lock); + io_run_task_work(); + mutex_lock(&ctx->uring_lock); + } + if (unlikely(task_sigpending(current))) + return -EINTR; + io_run_local_work_locked(ctx, nr_wait); + + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_overflow_flush_locked(ctx); + } + + return 0; +} + +int io_run_loop(struct io_ring_ctx *ctx) +{ + int ret; + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_loop(ctx); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/loop.h b/io_uring/loop.h new file mode 100644 index 000000000000..d7718b9ce61e --- /dev/null +++ b/io_uring/loop.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_LOOP_H +#define IOU_LOOP_H + +#include <linux/io_uring_types.h> + +struct iou_loop_params { + /* + * The CQE index to wait for. Only serves as a hint and can still be + * woken up earlier. 
+ */ + __u32 cq_wait_idx; +}; + +enum { + IOU_LOOP_CONTINUE = 0, + IOU_LOOP_STOP, +}; + +static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) +{ + return data_race(ctx->loop_step); +} + +int io_run_loop(struct io_ring_ctx *ctx); + +#endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 57ad0085869a..3ff9098573db 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - return target_ctx->task_complete; + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; } static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) diff --git a/io_uring/net.c b/io_uring/net.c index 8885d944130a..30cd22c0b934 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -375,10 +375,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_namelen = addr_len; } if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - if (sr->flags & IORING_SEND_VECTORIZED) - return -EINVAL; - req->flags |= REQ_F_IMPORT_BUFFER; - return 0; + if (!(sr->flags & IORING_SEND_VECTORIZED)) { + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; + } + + kmsg->msg.msg_iter.nr_segs = sr->len; + return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len); } if (req->flags & REQ_F_BUFFER_SELECT) return 0; @@ -396,6 +399,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe struct user_msghdr msg; int ret; + sr->flags |= IORING_SEND_VECTORIZED; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); if (unlikely(ret)) @@ -1333,11 +1337,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *iomsg; struct io_kiocb *notif; + u64 user_data; int ret; zc->done_io = 0; - if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) + if 
(unlikely(READ_ONCE(sqe->__pad2[0]))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) @@ -1346,7 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; - notif->cqe.user_data = req->cqe.user_data; + user_data = READ_ONCE(sqe->addr3); + if (!user_data) + user_data = req->cqe.user_data; + + notif->cqe.user_data = user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; @@ -1370,7 +1379,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - if (io_is_compat(req->ctx)) + if (io_is_compat(ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; iomsg = io_msg_alloc_async(req); @@ -1445,22 +1454,39 @@ static int io_sg_from_iter(struct sk_buff *skb, return ret; } -static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) +static int io_send_zc_import(struct io_kiocb *req, + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; + struct io_kiocb *notif = sr->notif; + int ret; WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - sr->notif->buf_index = req->buf_index; - return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, - (u64)(uintptr_t)sr->buf, sr->len, - ITER_SOURCE, issue_flags); + notif->buf_index = req->buf_index; + + if (!(sr->flags & IORING_SEND_VECTORIZED)) { + ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); + } else { + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, + notif, &kmsg->vec, uvec_segs, + issue_flags); + } + + if (unlikely(ret)) + return ret; + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 
0; } -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; @@ -1471,106 +1497,38 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (!(req->flags & REQ_F_POLLED) && - (zc->flags & IORING_RECVSEND_POLL_FIRST)) + (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; if (req->flags & REQ_F_IMPORT_BUFFER) { - req->flags &= ~REQ_F_IMPORT_BUFFER; - ret = io_send_zc_import(req, issue_flags); + ret = io_send_zc_import(req, kmsg, issue_flags); if (unlikely(ret)) return ret; } - msg_flags = zc->msg_flags; + msg_flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - - kmsg->msg.msg_flags = msg_flags; - kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &kmsg->msg); - - if (unlikely(ret < min_ret)) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - - if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { - zc->done_io += ret; - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } - - if (ret >= 0) - ret += zc->done_io; - else if (zc->done_io) - ret = zc->done_io; - - /* - * If we're in io-wq we can't rely on tw ordering guarantees, defer - * flushing notif to io_send_zc_cleanup() - */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - io_notif_flush(zc->notif); - zc->notif = NULL; - io_req_msg_cleanup(req, 0); - } - io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_COMPLETE; -} -int io_sendmsg_zc(struct io_kiocb *req, 
unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; - struct socket *sock; - unsigned flags; - int ret, min_ret = 0; - - if (req->flags & REQ_F_IMPORT_BUFFER) { - unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; - int ret; + kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - sr->notif->buf_index = req->buf_index; - ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, - sr->notif, &kmsg->vec, uvec_segs, - issue_flags); - if (unlikely(ret)) - return ret; - req->flags &= ~REQ_F_IMPORT_BUFFER; + if (req->opcode == IORING_OP_SEND_ZC) { + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + kmsg->msg.msg_flags = msg_flags; + ret = sock_sendmsg(sock, &kmsg->msg); + } else { + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); } - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) - return -EOPNOTSUPP; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_control_user = sr->msg_control; - kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; - if (ret > 0 && io_net_retry(sock, flags)) { + if (ret > 0 && io_net_retry(sock, sr->msg_flags)) { sr->done_io += ret; return -EAGAIN; } diff --git a/io_uring/net.h b/io_uring/net.h index a862960a3bb9..d4d1ddce50e3 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -50,7 +50,6 @@ void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) int io_connect_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 91a23baf415e..c3ef52b70811 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, @@ -82,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, @@ -102,7 +100,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, .issue = io_read_fixed, @@ -116,7 +113,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, .issue = io_write_fixed, @@ -250,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, @@ -264,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, @@ -423,7 +417,6 @@ const struct io_issue_def io_issue_defs[] = { .needs_file = 1, .plug = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, @@ -437,7 
+430,7 @@ const struct io_issue_def io_issue_defs[] = { #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, - .issue = io_send_zc, + .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif @@ -556,7 +549,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv_fixed, @@ -571,7 +563,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev_fixed, @@ -593,7 +584,6 @@ const struct io_issue_def io_issue_defs[] = { .needs_file = 1, .plug = 1, .iopoll = 1, - .iopoll_queue = 1, .is_128 = 1, .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index faf3955dce8b..667f981e63b0 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -25,8 +25,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* skip auditing */ unsigned audit_skip : 1; - /* have to be put into the iopoll list */ - unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; /* set to 1 if this opcode uses 128b sqes in a mixed sq */ diff --git a/io_uring/poll.c b/io_uring/poll.c index 2e9ee47d74bf..74eef7884159 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -277,8 +277,10 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + __poll_t events = req->apoll_events; + struct poll_table_struct pt = { ._key = events }; + + req->cqe.res = vfs_poll(req->file, &pt) & events; /* * We got woken with a mask, but someone else got to * it first. 
The above vfs_poll() doesn't add us back @@ -287,7 +289,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) */ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ - if (!(req->apoll_events & EPOLLONESHOT)) + if (!(events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } diff --git a/io_uring/query.c b/io_uring/query.c index 63cc30c9803d..c1704d088374 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -34,12 +34,12 @@ static ssize_t io_query_zcrx(union io_query_data *data) { struct io_uring_query_zcrx *e = &data->zcrx; - e->register_flags = ZCRX_REG_IMPORT; + e->register_flags = ZCRX_SUPPORTED_REG_FLAGS; e->area_flags = IORING_ZCRX_AREA_DMABUF; e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; e->rq_hdr_size = sizeof(struct io_uring); e->rq_hdr_alignment = L1_CACHE_BYTES; - e->features = ZCRX_FEATURE_RX_PAGE_SIZE; + e->features = ZCRX_FEATURES; e->__resv2 = 0; return sizeof(*e); } diff --git a/io_uring/register.c b/io_uring/register.c index 05362fe79804..24e593332d1a 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -192,9 +192,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return ret; } if (ctx->restrictions.op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (ctx->restrictions.reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; return 0; } @@ -392,7 +392,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); @@ -733,7 +733,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + 
if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; @@ -908,7 +908,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = -EINVAL; if (!arg || nr_args != 1) break; - ret = io_register_zcrx_ifq(ctx, arg); + ret = io_register_zcrx(ctx, arg); break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; @@ -946,40 +946,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } -/* - * Given an 'fd' value, return the ctx associated with if. If 'registered' is - * true, then the registered index is used. Otherwise, the normal fd table. - * Caller must call fput() on the returned file, unless it's an ERR_PTR. - */ -struct file *io_uring_register_get_file(unsigned int fd, bool registered) -{ - struct file *file; - - if (registered) { - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. 
- */ - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return ERR_PTR(-EINVAL); - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - if (file) - get_file(file); - } else { - file = fget(fd); - } - - if (unlikely(!file)) - return ERR_PTR(-EBADF); - if (io_is_uring_fops(file)) - return file; - fput(file); - return ERR_PTR(-EOPNOTSUPP); -} - static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) { struct io_uring_sqe sqe; @@ -1034,7 +1000,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, if (fd == -1) return io_uring_register_blind(opcode, arg, nr_args); - file = io_uring_register_get_file(fd, use_registered_ring); + file = io_uring_ctx_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; @@ -1046,6 +1012,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - fput(file); + if (!use_registered_ring) + fput(file); return ret; } diff --git a/io_uring/register.h b/io_uring/register.h index a5f39d5ef9e0..c9da997d503c 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,6 +4,5 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); -struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1b96ab5e98c9..fd36e0e319a2 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 tag = 0; uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = 
array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (ctx->compat) + if (io_is_compat(ctx)) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); @@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (arg) { uvec = (struct iovec __user *) arg; - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } - if (ctx->compat) + if (io_is_compat(ctx)) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); @@ -961,7 +961,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, */ imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) { - kfree(node); + io_cache_free(&ctx->node_cache, node); ret = -ENOMEM; goto unlock; } @@ -1273,7 +1273,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; - file = io_uring_register_get_file(buf.src_fd, registered_src); + file = io_uring_ctx_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); @@ -1295,7 +1295,8 @@ out: if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); - fput(file); + if (!registered_src) + fput(file); return ret; } diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a5f262734e8..20654deff84d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) + !(req->flags & REQ_F_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. 
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } } - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); @@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { u32 cflags = 0; __io_complete_rw_common(req, ret); @@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; @@ -899,7 +900,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. */ - if (!(req->file->f_flags & O_DIRECT)) + if (!(file->f_flags & O_DIRECT)) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; @@ -961,13 +962,13 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, if (ret == -EAGAIN) { /* If we can poll, just do that. 
*/ if (io_file_can_poll(req)) - return -EAGAIN; + return ret; /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) + return ret; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) - goto done; + return ret; ret = 0; } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; @@ -975,7 +976,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ - goto done; + return ret; } /* @@ -1018,8 +1019,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); -done: - /* it's faster to check here than delegate to kfree */ + return ret; } @@ -1188,7 +1188,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index c6bb938ec5ea..46c12afec73e 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -458,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return -EINVAL; } if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *tctx; struct task_struct *tsk; struct io_sq_data *sqd; bool attached; @@ -524,8 +525,13 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, rcu_assign_pointer(sqd->thread, tsk); mutex_unlock(&sqd->lock); + ret = 0; get_task_struct(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); + tctx = 
io_uring_alloc_task_context(tsk, ctx); + if (!IS_ERR(tctx)) + tsk->io_uring = tctx; + else + ret = PTR_ERR(tctx); wake_up_new_task(tsk); if (ret) goto err; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7cbcb82aedfb..61533f30494f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -74,20 +74,20 @@ void __io_uring_free(struct task_struct *tsk) } } -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; tctx = kzalloc_obj(*tctx); if (unlikely(!tctx)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->io_wq = io_init_wq_offload(ctx, task); @@ -95,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->task = task; @@ -103,31 +103,56 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); + return tctx; +} + +static int io_tctx_install_node(struct io_ring_ctx *ctx, + struct io_uring_task *tctx) +{ + struct io_tctx_node *node; + int ret; + + if (xa_load(&tctx->xa, (unsigned long)ctx)) + return 0; + + node = kmalloc_obj(*node); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->tctx_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->tctx_lock); return 0; } int __io_uring_add_tctx_node(struct 
io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; + tctx = io_uring_alloc_task_context(current, ctx); + if (IS_ERR(tctx)) + return PTR_ERR(tctx); - tctx = current->io_uring; - if (ctx->iowq_limits_set) { + if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) - return ret; + goto err_free; } } @@ -138,25 +163,19 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) */ if (tctx->io_wq) io_wq_set_exit_on_idle(tctx->io_wq, false); - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc_obj(*node); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - mutex_lock(&ctx->tctx_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->tctx_lock); + ret = io_tctx_install_node(ctx, tctx); + if (!ret) { + current->io_uring = tctx; + return 0; } - return 0; + if (!current->io_uring) { +err_free: + io_wq_put_and_exit(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + } + return ret; } int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 608e96de70a2..2310d2a0c46d 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -6,8 +6,8 @@ struct io_tctx_node { struct io_ring_ctx *ctx; }; -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); +struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); 
diff --git a/io_uring/timeout.c b/io_uring/timeout.c index cb61d4862fc6..4cfdfc519770 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -30,11 +30,30 @@ struct io_timeout_rem { u64 addr; /* timeout update */ - struct timespec64 ts; + ktime_t time; u32 flags; bool ltimeout; }; +static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) +{ + struct timespec64 ts; + + if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) { + *time = ns_to_ktime(arg); + if (*time < 0) + return -EINVAL; + return 0; + } + + if (get_timespec64(&ts, u64_to_user_ptr(arg))) + return -EFAULT; + if (ts.tv_sec < 0 || ts.tv_nsec < 0) + return -EINVAL; + *time = timespec64_to_ktime(ts); + return 0; +} + static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_kiocb *link); @@ -80,7 +99,7 @@ static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) /* re-arm timer */ raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return; } @@ -265,8 +284,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); + atomic_set(&ctx->cq_timeouts, + atomic_read(&ctx->cq_timeouts) + 1); raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) @@ -395,7 +414,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -417,12 +436,12 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 
user_data, if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + hrtimer_start(&io->timer, ts, mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t time, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; @@ -435,20 +454,23 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; - data->ts = *ts; + data->time = time; list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); + hrtimer_start(&data->timer, data->time, mode); return 0; } int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); + int ret; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; @@ -460,12 +482,13 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2)))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK | + IORING_TIMEOUT_ABS | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags); + if (ret) + return ret; } else if (tr->flags) { /* 
timeout removal doesn't support flags */ return -EINVAL; @@ -500,9 +523,9 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode); else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_timeout_update(ctx, tr->addr, tr->time, mode); raw_spin_unlock_irq(&ctx->timeout_lock); } @@ -520,7 +543,10 @@ static int __io_timeout_prep(struct io_kiocb *req, struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); + int ret; + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) @@ -528,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req, flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | - IORING_TIMEOUT_MULTISHOT)) + IORING_TIMEOUT_MULTISHOT | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -539,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req, INIT_LIST_HEAD(&timeout->list); timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) + req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr @@ -557,11 +584,9 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr)))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags); 
+ if (ret) + return ret; data->mode = io_translate_timeout_mode(flags); @@ -637,7 +662,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -655,8 +680,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); + hrtimer_start(&data->timer, data->time, data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } raw_spin_unlock_irq(&ctx->timeout_lock); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index 2b7c9ad72992..1620f94dd45a 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -3,7 +3,7 @@ struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; - struct timespec64 ts; + ktime_t time; enum hrtimer_mode mode; u32 flags; }; diff --git a/io_uring/tw.c b/io_uring/tw.c index 2f2b4ac4b126..fdff81eebc95 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (!head) { io_ctx_mark_taskrun(ctx); - if (ctx->has_evfd) + if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, false); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ee7b49f47cb5..7b25dcd9d05f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, * because iopoll completion data overlaps with the hash_node used * for tracking. 
*/ - if (ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) return; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { @@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_set_cqe32_extra(req, res2, 0); } io_req_uring_cleanup(req, issue_flags); - if (req->ctx->flags & IORING_SETUP_IOPOLL) { + if (req->flags & REQ_F_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { @@ -257,9 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) issue_flags |= IO_URING_F_CQE32; if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!file->f_op->uring_cmd_iopoll) - return -EOPNOTSUPP; + if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) { + req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { diff --git a/io_uring/wait.h b/io_uring/wait.h index 3a145fcfd3dd..a4274b137f81 100644 --- a/io_uring/wait.h +++ b/io_uring/wait.h @@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 62d693287457..bd970fb084c1 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -63,7 +63,7 @@ static int io_area_max_shift(struct io_zcrx_mem *mem) unsigned i; for_each_sgtable_dma_sg(sgt, sg, i) - shift = min(shift, __ffs(sg->length)); + shift = min(shift, __ffs(sg_dma_len(sg))); return shift; } @@ -127,10 +127,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq, int dmabuf_fd = 
area_reg->dmabuf_fd; int i, ret; + if (!ifq->dev) + return -EINVAL; if (off) return -EINVAL; - if (WARN_ON_ONCE(!ifq->dev)) - return -EFAULT; if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) return -EINVAL; @@ -194,6 +194,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, { struct page **pages; int nr_pages, ret; + bool mapped = false; if (area_reg->dmabuf_fd) return -EINVAL; @@ -207,22 +208,37 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 0, (unsigned long)nr_pages << PAGE_SHIFT, GFP_KERNEL_ACCOUNT); - if (ret) { - unpin_user_pages(pages, nr_pages); - kvfree(pages); - return ret; + if (ret) + goto out_err; + + if (ifq->dev) { + ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + goto out_err; + mapped = true; } mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); - if (ret < 0) + if (ret < 0) { mem->account_pages = 0; + goto out_err; + } mem->sgt = &mem->page_sg_table; mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; return ret; +out_err: + if (mapped) + dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + sg_free_table(&mem->page_sg_table); + unpin_user_pages(pages, nr_pages); + kvfree(pages); + return ret; } static void io_release_area_mem(struct io_zcrx_mem *mem) @@ -273,8 +289,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, return; area->is_mapped = false; - for (i = 0; i < area->nia.num_niovs; i++) - net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + if (area->nia.niovs) { + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + } if (area->mem.is_dmabuf) { io_release_dmabuf(&area->mem); @@ -284,45 +302,23 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, } } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area 
*area) -{ - int ret; - - guard(mutex)(&ifq->pp_lock); - if (area->is_mapped) - return 0; - - if (!area->mem.is_dmabuf) { - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret < 0) - return ret; - } - - ret = io_populate_area_dma(ifq, area); - if (ret && !area->mem.is_dmabuf) - dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret == 0) - area->is_mapped = true; - return ret; -} - -static void io_zcrx_sync_for_device(struct page_pool *pool, - struct net_iov *niov) +static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, + netmem_ref *netmems, unsigned nr) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + struct device *dev = pp->p.dev; + unsigned i, niov_size; dma_addr_t dma_addr; - unsigned niov_size; - - if (!dma_dev_need_sync(pool->p.dev)) + if (!dma_dev_need_sync(dev)) return; + niov_size = 1U << zcrx->niov_shift; - niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; - dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); - __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, - niov_size, pool->p.dma_dir); + for (i = 0; i < nr; i++) { + dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); + __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, + niov_size, pp->p.dma_dir); + } #endif } @@ -390,24 +386,24 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, return -EINVAL; mmap_offset = IORING_MAP_OFF_ZCRX_REGION; - mmap_offset += id << IORING_OFF_PBUF_SHIFT; + mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; - ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); + ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->region); - ifq->rq_ring = (struct io_uring *)ptr; - ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + ptr = io_region_get_ptr(&ifq->rq_region); + ifq->rq.ring = (struct io_uring *)ptr; + ifq->rq.rqes = 
(struct io_uring_zcrx_rqe *)(ptr + off); return 0; } static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->user, &ifq->region); - ifq->rq_ring = NULL; - ifq->rqes = NULL; + io_free_region(ifq->user, &ifq->rq_region); + ifq->rq.ring = NULL; + ifq->rq.rqes = NULL; } static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, @@ -429,8 +425,13 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { - if (ifq->area) + bool kern_readable = !area->mem.is_dmabuf; + + if (WARN_ON_ONCE(ifq->area)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) return -EINVAL; + ifq->area = area; return 0; } @@ -450,6 +451,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, return -EINVAL; buf_size_shift = ilog2(reg->rx_buf_len); } + if (!ifq->dev && buf_size_shift != PAGE_SHIFT) + return -EOPNOTSUPP; ret = -ENOMEM; area = kzalloc_obj(*area); @@ -460,8 +463,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, ret = io_import_area(ifq, &area->mem, area_reg); if (ret) goto err; + if (ifq->dev) + area->is_mapped = true; - if (buf_size_shift > io_area_max_shift(&area->mem)) { + if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { ret = -ERANGE; goto err; } @@ -495,6 +500,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->type = NET_IOV_IOURING; } + if (ifq->dev) { + ret = io_populate_area_dma(ifq, area); + if (ret) + goto err; + } + area->free_count = nr_iovs; /* we're only supporting one area per ifq for now */ area->area_id = 0; @@ -519,7 +530,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) return NULL; ifq->if_rxq = -1; - spin_lock_init(&ifq->rq_lock); + spin_lock_init(&ifq->rq.lock); mutex_init(&ifq->pp_lock); refcount_set(&ifq->refs, 1); refcount_set(&ifq->user_refs, 1); @@ -586,9 +597,21 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov) { struct io_zcrx_area *area = 
io_zcrx_iov_to_area(niov); - spin_lock_bh(&area->freelist_lock); + guard(spinlock_bh)(&area->freelist_lock); area->freelist[area->free_count++] = net_iov_idx(niov); - spin_unlock_bh(&area->freelist_lock); +} + +static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + if (unlikely(!area->free_count)) + return NULL; + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; } static void io_zcrx_return_niov(struct net_iov *niov) @@ -624,12 +647,17 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } -static void zcrx_unregister(struct io_zcrx_ifq *ifq) +static void zcrx_unregister_user(struct io_zcrx_ifq *ifq) { if (refcount_dec_and_test(&ifq->user_refs)) { io_close_queue(ifq); io_zcrx_scrub(ifq); } +} + +static void zcrx_unregister(struct io_zcrx_ifq *ifq) +{ + zcrx_unregister_user(ifq); io_put_zcrx_ifq(ifq); } @@ -640,7 +668,7 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->mmap_lock); - return ifq ? &ifq->region : NULL; + return ifq ? 
&ifq->rq_region : NULL; } static int zcrx_box_release(struct inode *inode, struct file *file) @@ -751,10 +779,50 @@ err: return ret; } -int io_register_zcrx_ifq(struct io_ring_ctx *ctx, - struct io_uring_zcrx_ifq_reg __user *arg) +static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_zcrx_area_reg *area) { struct pp_memory_provider_params mp_param = {}; + unsigned if_rxq = reg->if_rxq; + int ret; + + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, + reg->if_idx); + if (!ifq->netdev) + return -ENODEV; + + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); + + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq); + if (!ifq->dev) { + ret = -EOPNOTSUPP; + goto netdev_put_unlock; + } + get_device(ifq->dev); + + ret = io_zcrx_create_area(ifq, area, reg); + if (ret) + goto netdev_put_unlock; + + if (reg->rx_buf_len) + mp_param.rx_page_size = 1U << ifq->niov_shift; + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); + if (ret) + goto netdev_put_unlock; + + ifq->if_rxq = if_rxq; + ret = 0; +netdev_put_unlock: + netdev_unlock(ifq->netdev); + return ret; +} + +int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_uring_region_desc rd; @@ -778,11 +846,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) return -EINVAL; + if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) + return -EINVAL; if (reg.flags & ZCRX_REG_IMPORT) return import_zcrx(ctx, arg, ®); if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + if (reg.if_rxq == -1 || !reg.rq_entries) + return -EINVAL; + if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) 
return -EINVAL; if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { if (!(ctx->flags & IORING_SETUP_CLAMP)) @@ -806,7 +878,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, mmgrab(ctx->mm_account); ifq->mm_account = ctx->mm_account; } - ifq->rq_entries = reg.rq_entries; + ifq->rq.nr_entries = reg.rq_entries; scoped_guard(mutex, &ctx->mmap_lock) { /* preallocate id */ @@ -819,33 +891,17 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; - ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); - if (!ifq->netdev) { - ret = -ENODEV; - goto err; - } - netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); + ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); - ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); - if (!ifq->dev) { - ret = -EOPNOTSUPP; - goto netdev_put_unlock; + if (!(reg.flags & ZCRX_REG_NODEV)) { + ret = zcrx_register_netdev(ifq, ®, &area); + if (ret) + goto err; + } else { + ret = io_zcrx_create_area(ifq, &area, ®); + if (ret) + goto err; } - get_device(ifq->dev); - - ret = io_zcrx_create_area(ifq, &area, ®); - if (ret) - goto netdev_put_unlock; - - if (reg.rx_buf_len) - mp_param.rx_page_size = 1U << ifq->niov_shift; - mp_param.mp_ops = &io_uring_pp_zc_ops; - mp_param.mp_priv = ifq; - ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); - if (ret) - goto netdev_put_unlock; - netdev_unlock(ifq->netdev); - ifq->if_rxq = reg.if_rxq; reg.zcrx_id = id; @@ -865,8 +921,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; } return 0; -netdev_put_unlock: - netdev_unlock(ifq->netdev); err: scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); @@ -875,17 +929,37 @@ ifq_free: return ret; } -static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) +static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) { - unsigned niov_idx; + return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); +} - 
lockdep_assert_held(&area->freelist_lock); +static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) +{ + xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); +} - niov_idx = area->freelist[--area->free_count]; - return &area->nia.niovs[niov_idx]; +void io_terminate_zcrx(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + unsigned long id = 0; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (!ifq) + break; + if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) + break; + set_zcrx_entry_mark(ctx, id); + id++; + zcrx_unregister_user(ifq); + } } -void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +void io_unregister_zcrx(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -896,31 +970,35 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) unsigned long id = 0; ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); - if (ifq) + if (ifq) { + if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { + ifq = NULL; + break; + } xa_erase(&ctx->zcrx_ctxs, id); + } } if (!ifq) break; - zcrx_unregister(ifq); + io_put_zcrx_ifq(ifq); } xa_destroy(&ctx->zcrx_ctxs); } -static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) { u32 entries; - entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; - return min(entries, ifq->rq_entries); + entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; + return min(entries, rq->nr_entries); } -static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, - unsigned mask) +static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) { - unsigned int idx = ifq->cached_rq_head++ & mask; + unsigned int idx = rq->cached_head++ & mask; - return &ifq->rqes[idx]; + return &rq->rqes[idx]; } static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, @@ -946,21 +1024,24 @@ static inline 
bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, return true; } -static void io_zcrx_ring_refill(struct page_pool *pp, - struct io_zcrx_ifq *ifq) +static unsigned io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) { - unsigned int mask = ifq->rq_entries - 1; + struct zcrx_rq *rq = &ifq->rq; + unsigned int mask = rq->nr_entries - 1; unsigned int entries; + unsigned allocated = 0; - guard(spinlock_bh)(&ifq->rq_lock); + guard(spinlock_bh)(&rq->lock); - entries = io_zcrx_rqring_entries(ifq); - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); + entries = zcrx_rq_entries(rq); + entries = min_t(unsigned, entries, to_alloc); if (unlikely(!entries)) - return; + return 0; do { - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); struct net_iov *niov; netmem_ref netmem; @@ -978,46 +1059,56 @@ static void io_zcrx_ring_refill(struct page_pool *pp, continue; } - io_zcrx_sync_for_device(pp, niov); - net_mp_netmem_place_in_cache(pp, netmem); + netmems[allocated] = netmem; + allocated++; } while (--entries); - smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); + smp_store_release(&rq->ring->head, rq->cached_head); + return allocated; } -static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) { struct io_zcrx_area *area = ifq->area; + unsigned allocated = 0; - spin_lock_bh(&area->freelist_lock); - while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { - struct net_iov *niov = __io_zcrx_get_free_niov(area); - netmem_ref netmem = net_iov_to_netmem(niov); + guard(spinlock_bh)(&area->freelist_lock); + for (allocated = 0; allocated < to_alloc; allocated++) { + struct net_iov *niov = zcrx_get_free_niov(area); + + if (!niov) + break; net_mp_niov_set_page_pool(pp, niov); - 
io_zcrx_sync_for_device(pp, niov); - net_mp_netmem_place_in_cache(pp, netmem); + netmems[allocated] = net_iov_to_netmem(niov); } - spin_unlock_bh(&area->freelist_lock); + return allocated; } static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + netmem_ref *netmems = pp->alloc.cache; + unsigned to_alloc = PP_ALLOC_CACHE_REFILL; + unsigned allocated; /* pp should already be ensuring that */ - if (unlikely(pp->alloc.count)) - goto out_return; + if (WARN_ON_ONCE(pp->alloc.count)) + return 0; - io_zcrx_ring_refill(pp, ifq); - if (likely(pp->alloc.count)) + allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); + if (likely(allocated)) goto out_return; - io_zcrx_refill_slow(pp, ifq); - if (!pp->alloc.count) + allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); + if (!allocated) return 0; out_return: - return pp->alloc.cache[--pp->alloc.count]; + zcrx_sync_for_device(pp, ifq, netmems, allocated); + allocated--; + pp->alloc.count += allocated; + return netmems[allocated]; } static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) @@ -1036,7 +1127,6 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); - int ret; if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -1049,10 +1139,6 @@ static int io_pp_zc_init(struct page_pool *pp) if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; - ret = io_zcrx_map_area(ifq, ifq->area); - if (ret) - return ret; - refcount_inc(&ifq->refs); return 0; } @@ -1100,14 +1186,14 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { }; static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, - struct io_zcrx_ifq *zcrx) + struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) { - unsigned int mask = zcrx->rq_entries - 1; + unsigned int mask = rq->nr_entries - 1; unsigned int i; - nr = min(nr, 
io_zcrx_rqring_entries(zcrx)); + nr = min(nr, zcrx_rq_entries(rq)); for (i = 0; i < nr; i++) { - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); struct net_iov *niov; if (!io_parse_rqe(rqe, zcrx, &niov)) @@ -1115,7 +1201,7 @@ static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, netmem_array[i] = net_iov_to_netmem(niov); } - smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); + smp_store_release(&rq->ring->head, rq->cached_head); return i; } @@ -1149,8 +1235,10 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, return -EINVAL; do { - scoped_guard(spinlock_bh, &zcrx->rq_lock) { - nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); + struct zcrx_rq *rq = &zcrx->rq; + + scoped_guard(spinlock_bh, &rq->lock) { + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); zcrx_return_buffers(netmems, nr); } @@ -1159,7 +1247,7 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, if (fatal_signal_pending(current)) break; cond_resched(); - } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); return 0; } @@ -1169,6 +1257,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) struct zcrx_ctrl ctrl; struct io_zcrx_ifq *zcrx; + BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); + if (nr_args) return -EINVAL; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) @@ -1221,13 +1311,11 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) struct io_zcrx_area *area = ifq->area; struct net_iov *niov = NULL; - if (area->mem.is_dmabuf) + if (!ifq->kern_readable) return NULL; - spin_lock_bh(&area->freelist_lock); - if (area->free_count) - niov = __io_zcrx_get_free_niov(area); - spin_unlock_bh(&area->freelist_lock); + scoped_guard(spinlock_bh, &area->freelist_lock) + niov = zcrx_get_free_niov(area); if (niov) 
page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 32ab95b2cb81..75e0a4e6ef6e 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -8,6 +8,9 @@ #include <net/page_pool/types.h> #include <net/net_trackers.h> +#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) +#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) + struct io_zcrx_mem { unsigned long size; bool is_dmabuf; @@ -38,17 +41,22 @@ struct io_zcrx_area { struct io_zcrx_mem mem; }; +struct zcrx_rq { + spinlock_t lock; + struct io_uring *ring; + struct io_uring_zcrx_rqe *rqes; + u32 cached_head; + u32 nr_entries; +}; + struct io_zcrx_ifq { struct io_zcrx_area *area; unsigned niov_shift; struct user_struct *user; struct mm_struct *mm_account; + bool kern_readable; - spinlock_t rq_lock ____cacheline_aligned_in_smp; - struct io_uring *rq_ring; - struct io_uring_zcrx_rqe *rqes; - u32 cached_rq_head; - u32 rq_entries; + struct zcrx_rq rq ____cacheline_aligned_in_smp; u32 if_rxq; struct device *dev; @@ -63,26 +71,30 @@ struct io_zcrx_ifq { * net stack. 
*/ struct mutex pp_lock; - struct io_mapped_region region; + struct io_mapped_region rq_region; }; #if defined(CONFIG_IO_URING_ZCRX) int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); -int io_register_zcrx_ifq(struct io_ring_ctx *ctx, +int io_register_zcrx(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); -void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_unregister_zcrx(struct io_ring_ctx *ctx); +void io_terminate_zcrx(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id); #else -static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, - struct io_uring_zcrx_ifq_reg __user *arg) +static inline int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) { return -EOPNOTSUPP; } -static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +static inline void io_unregister_zcrx(struct io_ring_ctx *ctx) +{ +} +static inline void io_terminate_zcrx(struct io_ring_ctx *ctx) { } static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, |
