diff options
Diffstat (limited to 'io_uring')
| -rw-r--r-- | io_uring/cancel.c | 2 | ||||
| -rw-r--r-- | io_uring/fdinfo.c | 3 | ||||
| -rw-r--r-- | io_uring/io-wq.c | 3 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 38 | ||||
| -rw-r--r-- | io_uring/net.c | 26 | ||||
| -rw-r--r-- | io_uring/nop.c | 4 | ||||
| -rw-r--r-- | io_uring/rw.c | 4 | ||||
| -rw-r--r-- | io_uring/tctx.c | 12 | ||||
| -rw-r--r-- | io_uring/timeout.c | 20 | ||||
| -rw-r--r-- | io_uring/waitid.c | 1 |
10 files changed, 87 insertions, 26 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 5e5eb9cfc7cd..4aa3103ba9c3 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); - mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); if (tctx) ret |= io_run_task_work() > 0; else diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index c2d3e45544bb..001fb542dc11 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) get_task_struct(tsk); rcu_read_unlock(); usec = io_sq_cpu_usec(tsk); + sq_pid = task_pid_nr_ns(tsk, + proc_pid_ns(file_inode(m->file)->i_sb)); put_task_struct(tsk); - sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; sq_total_time = usec; sq_work_time = sq->work_time; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7a9f94a0ce6f..8cc7b47d3089 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq, if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) + if (prev_work && io_wq_is_hashed(prev_work) && + io_get_work_hash(prev_work) == hash) wq->hash_tail[hash] = prev_work; else wq->hash_tail[hash] = NULL; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4ed998d60c09..103b6c88f252 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -687,12 +687,26 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, } /* + * Compute queued CQEs for free-space calculation, clamped to cq_entries. + */ +static unsigned int io_cqring_queued(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = io_get_rings(ctx); + int diff; + + diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head)); + if (diff >= 0) + return min((unsigned int)diff, ctx->cq_entries); + return 0; +} + +/* * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE * because the ring is a single 16b entry away from wrapping. */ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) { - if (__io_cqring_events(ctx) < ctx->cq_entries) { + if (io_cqring_queued(ctx) < ctx->cq_entries) { struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; cqe->user_data = 0; @@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int free, queued, len; + unsigned int free, len; /* * Posting into the CQ when there are pending overflowed CQEs may break @@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) off = 0; } - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; + free = ctx->cq_entries - io_cqring_queued(ctx); /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (len < (cqe32 + 1)) @@ -1452,8 +1464,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test_atomic(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) + if (req->flags & IO_REQ_LINK_FLAGS) { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); nxt = io_req_find_next(req); + mutex_unlock(&ctx->uring_lock); + } io_free_req(req); } return nxt ? &nxt->work : NULL; @@ -1721,10 +1738,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_issue_def *def; unsigned int sqe_flags; int personality; - u8 opcode; req->ctx = ctx; - req->opcode = opcode = READ_ONCE(sqe->opcode); + req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); req->flags = (__force io_req_flags_t) sqe_flags; @@ -1734,13 +1750,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->cancel_seq_set = false; req->async_data = NULL; - if (unlikely(opcode >= IORING_OP_LAST)) { + if (unlikely(req->opcode >= IORING_OP_LAST)) { req->opcode = 0; return io_init_fail_req(req, -EINVAL); } - opcode = array_index_nospec(opcode, IORING_OP_LAST); + req->opcode = array_index_nospec(req->opcode, IORING_OP_LAST); - def = &io_issue_defs[opcode]; + def = &io_issue_defs[req->opcode]; if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { /* * A 128b op on a non-128b SQ requires mixed SQE support as diff --git a/io_uring/net.c b/io_uring/net.c index 30cd22c0b934..8df15b639358 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -4,6 +4,7 @@ #include <linux/file.h> #include <linux/slab.h> #include <linux/net.h> +#include <linux/un.h> #include <linux/compat.h> #include <net/compat.h> #include <linux/io_uring.h> @@ -1799,11 +1800,29 @@ out: return IOU_COMPLETE; } +/* + * Check if bind request would potentially end up with filename_create(), + * which in turn end up in mnt_want_write() which will grab the fs + * percpu start write sem. This can trigger a lockdep warning. + */ +static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len) +{ + const struct sockaddr_un *sun; + + if (io->addr.ss_family != AF_UNIX) + return 0; + if (addr_len <= offsetof(struct sockaddr_un, sun_path)) + return 0; + sun = (const struct sockaddr_un *) &io->addr; + return sun->sun_path[0] != '\0'; +} + int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); struct sockaddr __user *uaddr; struct io_async_msghdr *io; + int ret; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1814,7 +1833,12 @@ int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io = io_msg_alloc_async(req); if (unlikely(!io)) return -ENOMEM; - return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); + ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); + if (unlikely(ret)) + return ret; + if (io_bind_file_create(io, bind->addr_len)) + req->flags |= REQ_F_FORCE_ASYNC; + return 0; } int io_bind(struct io_kiocb *req, unsigned int issue_flags) diff --git a/io_uring/nop.c b/io_uring/nop.c index 3caf07878f8a..f5c9969e7f64 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -79,9 +79,9 @@ done: if (ret < 0) req_set_fail(req); if (nop->flags & IORING_NOP_CQE32) - io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); + io_req_set_res32(req, ret, 0, nop->extra1, nop->extra2); else - io_req_set_res(req, nop->result, 0); + io_req_set_res(req, ret, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); diff --git a/io_uring/rw.c b/io_uring/rw.c index e729e0e7657e..0c4834645279 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) } static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, - u64 attr_ptr, u64 attr_type_mask) + u64 attr_ptr) { struct io_uring_attr_pi pi_attr; struct io_async_rw *io; @@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr); } return 0; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 6af62ca9baba..42b219b34aa8 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -139,12 +139,14 @@ static int io_tctx_install_node(struct io_ring_ctx *ctx, int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; + bool new_tctx = false; int ret; if (unlikely(!tctx)) { tctx = io_uring_alloc_task_context(current, ctx); if (IS_ERR(tctx)) return PTR_ERR(tctx); + new_tctx = true; if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2]; @@ -168,13 +170,15 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (tctx->io_wq) io_wq_set_exit_on_idle(tctx->io_wq, false); - ret = io_tctx_install_node(ctx, tctx); - if (!ret) { + if (new_tctx) current->io_uring = tctx; + + ret = io_tctx_install_node(ctx, tctx); + if (!ret) return 0; - } - if (!current->io_uring) { err_free: + if (new_tctx) { + current->io_uring = NULL; if (tctx->io_wq) { io_wq_exit_start(tctx->io_wq); io_wq_put_and_exit(tctx->io_wq); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e2595cae2b07..c4dd26cf342d 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); + + /* If this is NULL, then timer already claimed it and will complete it */ + if (!timeout->head) + return NULL; timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); @@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) int ret; if (prev) { + /* + * splice the linked timeout out of prev's chain if the regular + * completion path didn't already do it. + */ + if (prev->link == req) + prev->link = req->link; + req->link = NULL; + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, @@ -401,12 +413,14 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) /* * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. + * race with the completion of the linked work. Splice of prev is + * done in io_req_task_link_timeout(), if needed. */ if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) + if (!req_ref_inc_not_zero(prev)) { + io_remove_next_linked(prev); prev = NULL; + } } list_del(&timeout->list); timeout->prev = prev; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index d25d60aed6af..32f68fd7fcdd 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -275,6 +275,7 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) iw->options = READ_ONCE(sqe->file_index); iw->head = NULL; iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + memset(&iw->info, 0, sizeof(iw->info)); return 0; } |
