From 0b8cfa974dfc964e6382c9e25fa6c1bdac6ef499 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 21 Mar 2021 14:16:08 -0600 Subject: io_uring: don't use {test,clear}_tsk_thread_flag() for current Linus correctly points out that this is both unnecessary and generates much worse code on some archs as going from current to thread_info is actually backwards - and obviously just wasteful, since the thread_info is what we care about. Since io_uring only operates on current for these operations, just use test_thread_flag() instead. For io-wq, we can further simplify and use tracehook_notify_signal() to handle the TIF_NOTIFY_SIGNAL work and clear the flag. The latter isn't an actual bug right now, but it may very well be in the future if we place other work items under TIF_NOTIFY_SIGNAL. Reported-by: Linus Torvalds Link: https://lore.kernel.org/io-uring/CAHk-=wgYhNck33YHKZ14mFB5MzTTk8gqXHcfj=RWTAXKwgQJgg@mail.gmail.com/ Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 543551d70327..be04bc6b5b99 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6873,7 +6873,7 @@ static int io_run_task_work_sig(void) return 1; if (!signal_pending(current)) return 0; - if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL)) + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) return -ERESTARTSYS; return -EINTR; } -- cgit v1.2.3 From d07f1e8a42614cc938c9c88866d4474a5a7fee31 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:45:58 +0000 Subject: io_uring: correct io_queue_async_work() traces Request's io-wq work is hashed in io_prep_async_link(), so as trace_io_uring_queue_async_work() looks at it should follow after prep has been done. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/709c9f872f4d2e198c7aed9c49019ca7095dd24d.1616366969.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index be04bc6b5b99..50acd6dfd966 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1239,10 +1239,10 @@ static void io_queue_async_work(struct io_kiocb *req) BUG_ON(!tctx); BUG_ON(!tctx->io_wq); - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); /* init ->work of the whole link before punting */ io_prep_async_link(req); + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); -- cgit v1.2.3 From b65c128f963df367a8adcfb08f5ecf8721052723 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:45:59 +0000 Subject: io_uring: don't skip file_end_write() on reissue Don't miss to call kiocb_end_write() from __io_complete_rw() on reissue. Shouldn't be much of a problem as the function actually does some work only for ISREG, and NONBLOCK won't be reissued. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/32af9b77c5b874e1bee1a3c46396094bd969e577.1616366969.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 50acd6dfd966..a664b44a5a94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2524,13 +2524,12 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { int cflags = 0; + if (req->rw.kiocb.ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req)) return; if (res != req->result) req_set_fail_links(req); - - if (req->rw.kiocb.ki_flags & IOCB_WRITE) - kiocb_end_write(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, res, cflags); -- cgit v1.2.3 From d81269fecb8ce16eb07efafc9ff5520b2a31c486 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 10:21:19 +0000 Subject: io_uring: fix provide_buffers sign extension io_provide_buffers_prep()'s "p->len * p->nbufs" to sign extension problems. Not a huge problem as it's only used for access_ok() and increases the checked length, but better to keep typing right. Reported-by: Colin Ian King Fixes: efe68c1ca8f49 ("io_uring: validate the full range of provided buffers for access") Signed-off-by: Pavel Begunkov Reviewed-by: Colin Ian King Link: https://lore.kernel.org/r/562376a39509e260d8532186a06226e56eb1f594.1616149233.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a664b44a5a94..f3ae83a2d7bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3977,6 +3977,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + unsigned long size; struct io_provide_buf *p = &req->pbuf; u64 tmp; @@ -3990,7 +3991,8 @@ static int io_provide_buffers_prep(struct io_kiocb *req, p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); - if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs))) + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; p->bgid = READ_ONCE(sqe->buf_group); -- cgit v1.2.3 From a185f1db59f13de73aa470559030e90e50b34d93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 23 Mar 2021 10:52:38 +0000 Subject: io_uring: do ctx sqd ejection in a clear context WARNING: CPU: 1 PID: 27907 at fs/io_uring.c:7147 io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147 CPU: 1 PID: 27907 Comm: iou-sqp-27905 Not tainted 5.12.0-rc4-syzkaller #0 RIP: 0010:io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147 Call Trace: io_ring_ctx_wait_and_kill+0x214/0x700 fs/io_uring.c:8619 io_uring_release+0x3e/0x50 fs/io_uring.c:8646 __fput+0x288/0x920 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:140 io_run_task_work fs/io_uring.c:2238 [inline] io_run_task_work fs/io_uring.c:2228 [inline] io_uring_try_cancel_requests+0x8ec/0xc60 fs/io_uring.c:8770 io_uring_cancel_sqpoll+0x1cf/0x290 fs/io_uring.c:8974 io_sqpoll_cancel_cb+0x87/0xb0 fs/io_uring.c:8907 io_run_task_work_head+0x58/0xb0 fs/io_uring.c:1961 io_sq_thread+0x3e2/0x18d0 fs/io_uring.c:6763 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 May happen that last ctx ref is killed in io_uring_cancel_sqpoll(), so fput callback (i.e. io_uring_release()) is enqueued through task_work, and run by same cancellation. As it's deeply nested we can't do parking or taking sqd->lock there, because its state is unclear. So avoid ctx ejection from sqd list from io_ring_ctx_wait_and_kill() and do it in a clear context in io_ring_exit_work(). Fixes: f6d54255f423 ("io_uring: halt SQO submission on ctx exit") Reported-by: syzbot+e3a3f84f5cecf61f0583@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e90df88b8ff2cabb14a7534601d35d62ab4cb8c7.1616496707.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f3ae83a2d7bc..8c5789b96dbb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8564,6 +8564,14 @@ static void io_ring_exit_work(struct work_struct *work) struct io_tctx_node *node; int ret; + /* prevent SQPOLL from submitting new requests */ + if (ctx->sq_data) { + io_sq_thread_park(ctx->sq_data); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(ctx->sq_data); + io_sq_thread_unpark(ctx->sq_data); + } + /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while @@ -8615,14 +8623,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - /* prevent SQPOLL from submitting new requests */ - if (ctx->sq_data) { - io_sq_thread_park(ctx->sq_data); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(ctx->sq_data); - io_sq_thread_unpark(ctx->sq_data); - } - io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); -- cgit v1.2.3 From 90b8749022bbdd0c94a13182a78f4903b98fd0d7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 19:05:14 +0000 Subject: io_uring: maintain CQE order of a failed link Arguably we want CQEs of linked requests be in a strict order of submission as it always was. Now if init of a request fails its CQE may be posted before all prior linked requests including the head of the link. Fix it by failing it last. Fixes: de59bc104c24f ("io_uring: fail links more in io_submit_sqe()") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7a96b05832e7ab23ad55f84092a2548c4a888b0.1616699075.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c5789b96dbb..54ea561db4a5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6480,8 +6480,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: - io_put_req(req); - io_req_complete(req, ret); if (link->head) { /* fail even hard links since we don't submit */ link->head->flags |= REQ_F_FAIL_LINK; @@ -6489,6 +6487,8 @@ fail_req: io_req_complete(link->head, -ECANCELED); link->head = NULL; } + io_put_req(req); + io_req_complete(req, ret); return ret; } ret = io_req_prep(req, sqe); -- cgit v1.2.3 From dbe1bdbb39db7dfe80a903f0d267f62cf3f093d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:16:06 -0600 Subject: io_uring: handle signals for IO threads like a normal thread We go through various hoops to disallow signals for the IO threads, but there's really no reason why we cannot just allow them. The IO threads never return to userspace like a normal thread, and hence don't go through normal signal processing. Instead, just check for a pending signal as part of the work loop, and call get_signal() to handle it for us if anything is pending. With that, we can support receiving signals, including special ones like SIGSTOP. Acked-by: "Eric W. Biederman" Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 54ea561db4a5..2d43f7b87083 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,7 +78,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -6765,8 +6764,13 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; continue; } - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -6809,7 +6813,6 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); - try_to_freeze(); mutex_lock(&sqd->lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); -- cgit v1.2.3 From 1ee4160c73b2102a52bc97a4128a89c34821414f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:42 +0000 Subject: io_uring: fix timeout cancel return code When we cancel a timeout we should emit a sensible return code, like -ECANCELED but not 0, otherwise it may trick users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7b0ad1065e3bd1994722702bd0ba9e7bc9b0683b.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d43f7b87083..229ab9bfb45b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1247,7 +1247,7 @@ static void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req) +static void io_kill_timeout(struct io_kiocb *req, int status) { struct io_timeout_data *io = req->async_data; int ret; @@ -1257,7 +1257,7 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, 0); + io_cqring_fill_event(req, status); io_put_req_deferred(req, 1); } } @@ -1274,7 +1274,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, files)) { - io_kill_timeout(req); + io_kill_timeout(req, -ECANCELED); canceled++; } } @@ -1326,7 +1326,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) break; list_del_init(&req->timeout.list); - io_kill_timeout(req); + io_kill_timeout(req, 0); } while (!list_empty(&ctx->timeout_list)); ctx->cq_last_tm_flush = seq; -- cgit v1.2.3 From 80c4cbdb5ee604712e59fe304d7bf084b562f705 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:43 +0000 Subject: io_uring: do post-completion chore on t-out cancel Don't forget about io_commit_cqring() + io_cqring_ev_posted() after exit/exec cancelling timeouts. Both functions declared only after io_kill_timeouts(), so to avoid tons of forward declarations move it down. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/72ace588772c0f14834a6a4185d56c445a366fb4.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 229ab9bfb45b..8498f74595f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1262,26 +1262,6 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -/* - * Returns true if we found and killed one or more timeouts - */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->completion_lock); - return canceled != 0; -} - static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { @@ -8611,6 +8591,28 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } +/* Returns true if we found and killed one or more timeouts */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) +{ + struct io_kiocb *req, *tmp; + int canceled = 0; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_match_task(req, tsk, files)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; -- cgit v1.2.3 From 2482b58ffbdc80cfaae969ad19cb32803056505b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:44 +0000 Subject: io_uring: don't cancel-track common timeouts Don't account usual timeouts (i.e. not linked) as REQ_F_INFLIGHT but keep behaviour prior to dd59a3d595cc1 ("io_uring: reliably cancel linked timeouts"). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/104441ef5d97e3932113d44501fda0df88656b83.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8498f74595f3..87d198d223a2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5563,7 +5563,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - io_req_track_inflight(req); + if (is_timeout_link) + io_req_track_inflight(req); return 0; } -- cgit v1.2.3 From 78d9d7c2a331fb7a68a86e53ef7e12966459e0c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:45 +0000 Subject: io_uring: don't cancel extra on files match As tasks always wait and kill their io-wq on exec/exit, files are of no more concern to us, so we don't need to specifically cancel them by hand in those cases. Moreover we should not, because io_match_task() looks at req->task->files now, which is always true and so leads to extra cancellations, that wasn't a case before per-task io-wq. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0566c1de9b9dd417f5de345c817ca953580e0e2e.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 87d198d223a2..880abd8b6d31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1094,8 +1094,6 @@ static bool io_match_task(struct io_kiocb *head, io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; - if (req->task->files == files) - return true; } return false; } -- cgit v1.2.3 From 2b8ed1c94182dbbd0163d0eb443a934cbf6b0d85 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Mar 2021 19:52:51 +0000 Subject: io_uring: remove unsued assignment to pointer io There is an assignment to io that is never read after the assignment, the assignment is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 880abd8b6d31..1949b80677e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4798,7 +4798,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto out; } - io = req->async_data; memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } -- cgit v1.2.3