From c9d952b9103b600ddafc5d1c0e2f2dbd30f0b805 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 5 Oct 2024 19:06:50 -0600
Subject: io_uring/rw: fix cflags posting for single issue multishot read

If multishot gets disabled, and hence the request will get terminated
rather than persist for more iterations, then posting the CQE with the
right cflags is still important. Most notably, the buffer reference
needs to be included.

Refactor the return of __io_read() a bit, so that the provided buffer
is always put correctly, and hence returned to the application.

Reported-by: Sharon Rosner
Link: https://github.com/axboe/liburing/issues/1257
Cc: stable@vger.kernel.org
Fixes: 2a975d426c82 ("io_uring/rw: don't allow multishot reads without NOWAIT support")
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'io_uring')

diff --git a/io_uring/rw.c b/io_uring/rw.c
index f023ff49c688..93ad92605884 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -972,17 +972,21 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
                 if (issue_flags & IO_URING_F_MULTISHOT)
                         return IOU_ISSUE_SKIP_COMPLETE;
                 return -EAGAIN;
-        }
-
-        /*
-         * Any successful return value will keep the multishot read armed.
-         */
-        if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
+        } else if (ret <= 0) {
+                io_kbuf_recycle(req, issue_flags);
+                if (ret < 0)
+                        req_set_fail(req);
+        } else {
                 /*
-                 * Put our buffer and post a CQE. If we fail to post a CQE, then
+                 * Any successful return value will keep the multishot read
+                 * armed, if it's still set. Put our buffer and post a CQE. If
+                 * we fail to post a CQE, or multishot is no longer set, then
                  * jump to the termination path. This request is then done.
                  */
                 cflags = io_put_kbuf(req, ret, issue_flags);
+                if (!(req->flags & REQ_F_APOLL_MULTISHOT))
+                        goto done;
+
                 rw->len = 0; /* similarly to above, reset len to 0 */

                 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1003,6 +1007,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
          * Either an error, or we've hit overflow posting the CQE. For any
          * multishot request, hitting overflow will terminate it.
          */
+done:
         io_req_set_res(req, ret, cflags);
         io_req_rw_cleanup(req, issue_flags);
         if (issue_flags & IO_URING_F_MULTISHOT)
--
cgit v1.2.3


From f7c9134385331c5ef36252895130aa01a92de907 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 6 Oct 2024 10:40:36 -0600
Subject: io_uring/rw: allow pollable non-blocking attempts for !FMODE_NOWAIT

The checking for whether or not io_uring can do a non-blocking read or
write attempt is gated on FMODE_NOWAIT. However, if the file is
pollable, it's feasible to just check if it's currently in a state in
which it can sanely receive or send _some_ data.

This avoids unnecessary io-wq punts, and repeated worthless retries
before doing that punt, by assuming that some data can get delivered
or received if poll tells us that is true. It also allows multishot
reads to properly work with these types of files, enabling a bit of a
cleanup of the logic that:

c9d952b9103b ("io_uring/rw: fix cflags posting for single issue multishot read")

had to put in place.
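For reference, the request type these two fixes affect looks roughly like
the liburing sketch below: a multishot read armed against a provided buffer
ring. This is an illustrative sketch, not part of the patch; it assumes
liburing 2.6+ (for io_uring_prep_read_multishot() and the buffer ring
helpers) and a kernel with IORING_OP_READ_MULTISHOT, and the pipe fd, group
id and buffer sizes are arbitrary example values.

/* sketch: multishot read + provided buffer ring via liburing (assumptions above) */
#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

#define BGID     1        /* arbitrary buffer group id */
#define NR_BUFS  8
#define BUF_SIZE 4096

int main(void)
{
        struct io_uring ring;
        struct io_uring_buf_ring *br;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        static char bufs[NR_BUFS][BUF_SIZE];
        int fds[2], i, ret;

        if (pipe(fds) < 0 || io_uring_queue_init(8, &ring, 0) < 0)
                return 1;

        /* register a ring of provided buffers for group BGID */
        br = io_uring_setup_buf_ring(&ring, NR_BUFS, BGID, 0, &ret);
        if (!br)
                return 1;
        for (i = 0; i < NR_BUFS; i++)
                io_uring_buf_ring_add(br, bufs[i], BUF_SIZE, i,
                                      io_uring_buf_ring_mask(NR_BUFS), i);
        io_uring_buf_ring_advance(br, NR_BUFS);

        /* one SQE keeps posting completions until IORING_CQE_F_MORE is cleared */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read_multishot(sqe, fds[0], 0, 0, BGID);
        io_uring_submit(&ring);

        if (write(fds[1], "hello", 5) != 5)
                return 1;

        if (io_uring_wait_cqe(&ring, &cqe) == 0) {
                /* with IORING_CQE_F_BUFFER set, the buffer id is in the upper flag bits */
                int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

                printf("res=%d buffer id=%d more=%d\n", cqe->res, bid,
                       !!(cqe->flags & IORING_CQE_F_MORE));
                io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_free_buf_ring(&ring, br, NR_BUFS, BGID);
        io_uring_queue_exit(&ring);
        return 0;
}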
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'io_uring')

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 93ad92605884..80ae3c2ebb70 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -31,9 +31,19 @@ struct io_rw {
         rwf_t                           flags;
 };

-static inline bool io_file_supports_nowait(struct io_kiocb *req)
+static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
 {
-        return req->flags & REQ_F_SUPPORT_NOWAIT;
+        /* If FMODE_NOWAIT is set for a file, we're golden */
+        if (req->flags & REQ_F_SUPPORT_NOWAIT)
+                return true;
+        /* No FMODE_NOWAIT, if we can poll, check the status */
+        if (io_file_can_poll(req)) {
+                struct poll_table_struct pt = { ._key = mask };
+
+                return vfs_poll(req->file, &pt) & mask;
+        }
+        /* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */
+        return false;
 }

 #ifdef CONFIG_COMPAT
@@ -796,8 +806,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
          * supports async. Otherwise it's impossible to use O_NONBLOCK files
          * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
          */
-        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
-            ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+        if (kiocb->ki_flags & IOCB_NOWAIT ||
+            ((file->f_flags & O_NONBLOCK && (req->flags & REQ_F_SUPPORT_NOWAIT))))
                 req->flags |= REQ_F_NOWAIT;

         if (ctx->flags & IORING_SETUP_IOPOLL) {
@@ -838,7 +848,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)

         if (force_nonblock) {
                 /* If the file doesn't support async, just async punt */
-                if (unlikely(!io_file_supports_nowait(req)))
+                if (unlikely(!io_file_supports_nowait(req, EPOLLIN)))
                         return -EAGAIN;
                 kiocb->ki_flags |= IOCB_NOWAIT;
         } else {
@@ -951,13 +961,6 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)

         ret = __io_read(req, issue_flags);

-        /*
-         * If the file doesn't support proper NOWAIT, then disable multishot
-         * and stay in single shot mode.
-         */
-        if (!io_file_supports_nowait(req))
-                req->flags &= ~REQ_F_APOLL_MULTISHOT;
-
         /*
          * If we get -EAGAIN, recycle our buffer and just let normal poll
          * handling arm it.
@@ -984,9 +987,6 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
                  * jump to the termination path. This request is then done.
                  */
                 cflags = io_put_kbuf(req, ret, issue_flags);
-                if (!(req->flags & REQ_F_APOLL_MULTISHOT))
-                        goto done;
-
                 rw->len = 0; /* similarly to above, reset len to 0 */

                 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1007,7 +1007,6 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
          * Either an error, or we've hit overflow posting the CQE. For any
          * multishot request, hitting overflow will terminate it.
          */
-done:
         io_req_set_res(req, ret, cflags);
         io_req_rw_cleanup(req, issue_flags);
         if (issue_flags & IO_URING_F_MULTISHOT)
@@ -1031,7 +1030,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)

         if (force_nonblock) {
                 /* If the file doesn't support async, just async punt */
-                if (unlikely(!io_file_supports_nowait(req)))
+                if (unlikely(!io_file_supports_nowait(req, EPOLLOUT)))
                         goto ret_eagain;

                 /* Check if we can support NOWAIT. */
--
cgit v1.2.3


From 28aabffae6be54284869a91cd8bccd3720041129 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 15 Oct 2024 08:58:25 -0600
Subject: io_uring/sqpoll: close race on waiting for sqring entries

When an application uses SQPOLL, it must wait for the SQPOLL thread to
consume SQE entries, if it fails to get an sqe when calling
io_uring_get_sqe().
It can do so by calling io_uring_enter(2) with the flag value of
IORING_ENTER_SQ_WAIT. In liburing, this is generally done with
io_uring_sqring_wait(). There's a natural expectation that once this
call returns, a new SQE entry can be retrieved, filled out, and
submitted.

However, the kernel uses the cached sq head to determine if the SQRING
is full or not. If the SQPOLL thread is currently in the process of
submitting SQE entries, it may have updated the cached sq head, but not
yet committed it to the SQ ring. Hence the kernel may find that there
are SQE entries ready to be consumed, and return successfully to the
application. If the SQPOLL thread hasn't yet committed the SQ ring
entries by the time the application returns to userspace and attempts
to get a new SQE, it will fail getting a new SQE.

Fix this by having io_sqring_full() always use the user visible SQ ring
head entry, rather than the internally cached one.

Cc: stable@vger.kernel.org # 5.10+
Link: https://github.com/axboe/liburing/discussions/1267
Reported-by: Benedek Thaler
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9d70b2cf7b1e..913dbcebe5c9 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -284,7 +284,14 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 {
         struct io_rings *r = ctx->rings;

-        return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
+        /*
+         * SQPOLL must use the actual sqring head, as using the cached_sq_head
+         * is race prone if the SQPOLL thread has grabbed entries but not yet
+         * committed them to the ring. For !SQPOLL, this doesn't matter, but
+         * since this helper is just used for SQPOLL sqring waits (or POLLOUT),
+         * just read the actual sqring head unconditionally.
+         */
+        return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
 }

 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
--
cgit v1.2.3


From 858e686a30d7bffba3f3527add4f78766a4389d0 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 16 Oct 2024 07:09:25 -0600
Subject: io_uring/rsrc: ignore dummy_ubuf for buffer cloning

For placeholder buffers, &dummy_ubuf is assigned, which is a static
value. When a clone of the buffers is attempted, don't grab a reference
to it, as we don't need one and it would actively fail, since dummy_ubuf
doesn't have a valid reference count setup.

Link: https://lore.kernel.org/io-uring/Zw8dkUzsxQ5LgAJL@ly-workstation/
Reported-by: Lai, Yi
Fixes: 7cc2a6eadcd7 ("io_uring: add IORING_REGISTER_COPY_BUFFERS method")
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 33a3d156a85b..6f3b6de230bd 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1176,7 +1176,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
         for (i = 0; i < nbufs; i++) {
                 struct io_mapped_ubuf *src = src_ctx->user_bufs[i];

-                refcount_inc(&src->refs);
+                if (src != &dummy_ubuf)
+                        refcount_inc(&src->refs);
                 user_bufs[i] = src;
         }

--
cgit v1.2.3


From 8f7033aa4089fbaf7a33995f0f2ee6c9d7b9ca1b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 17 Oct 2024 08:31:56 -0600
Subject: io_uring/sqpoll: ensure task state is TASK_RUNNING when running task_work

When the sqpoll thread is exiting and cancels pending work items, it may
need to run task_work.
If this happens from within io_uring_cancel_generic(), then it may be
waiting on the io_uring_task waitqueue. This results in the below splat
from the scheduler, as grabbing the ring mutex may be attempted while in
a TASK_INTERRUPTIBLE state.

Ensure that the task state is set appropriately for that, just like what
is done for the other cases in io_run_task_work().

do not call blocking ops when !TASK_RUNNING; state=1 set at
 [<0000000029387fd2>] prepare_to_wait+0x88/0x2fc
WARNING: CPU: 6 PID: 59939 at kernel/sched/core.c:8561 __might_sleep+0xf4/0x140
Modules linked in:
CPU: 6 UID: 0 PID: 59939 Comm: iou-sqp-59938 Not tainted 6.12.0-rc3-00113-g8d020023b155 #7456
Hardware name: linux,dummy-virt (DT)
pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
pc : __might_sleep+0xf4/0x140
lr : __might_sleep+0xf4/0x140
sp : ffff80008c5e7830
x29: ffff80008c5e7830 x28: ffff0000d93088c0 x27: ffff60001c2d7230
x26: dfff800000000000 x25: ffff0000e16b9180 x24: ffff80008c5e7a50
x23: 1ffff000118bcf4a x22: ffff0000e16b9180 x21: ffff0000e16b9180
x20: 000000000000011b x19: ffff80008310fac0 x18: 1ffff000118bcd90
x17: 30303c5b20746120 x16: 74657320313d6574 x15: 0720072007200720
x14: 0720072007200720 x13: 0720072007200720 x12: ffff600036c64f0b
x11: 1fffe00036c64f0a x10: ffff600036c64f0a x9 : dfff800000000000
x8 : 00009fffc939b0f6 x7 : ffff0001b6327853 x6 : 0000000000000001
x5 : ffff0001b6327850 x4 : ffff600036c64f0b x3 : ffff8000803c35bc
x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000e16b9180
Call trace:
 __might_sleep+0xf4/0x140
 mutex_lock+0x84/0x124
 io_handle_tw_list+0xf4/0x260
 tctx_task_work_run+0x94/0x340
 io_run_task_work+0x1ec/0x3c0
 io_uring_cancel_generic+0x364/0x524
 io_sq_thread+0x820/0x124c
 ret_from_fork+0x10/0x20

Cc: stable@vger.kernel.org
Fixes: af5d68f8892f ("io_uring/sqpoll: manage task_work privately")
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'io_uring')

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 913dbcebe5c9..70b6675941ff 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -327,6 +327,7 @@ static inline int io_run_task_work(void)
         if (current->io_uring) {
                 unsigned int count = 0;

+                __set_current_state(TASK_RUNNING);
                 tctx_task_work_run(current->io_uring, UINT_MAX, &count);
                 if (count)
                         ret = true;
--
cgit v1.2.3


From ae6a888a4357131c01d85f4c91fb32552dd0bf70 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 19 Oct 2024 09:16:51 -0600
Subject: io_uring/rw: fix wrong NOWAIT check in io_rw_init_file()

A previous commit improved how !FMODE_NOWAIT is dealt with, but
inadvertently negated a check whilst doing so. This caused -EAGAIN to be
returned from reading files with O_NONBLOCK set.

Fix up the check for REQ_F_SUPPORT_NOWAIT.

Reported-by: Julian Orth
Link: https://github.com/axboe/liburing/issues/1270
Fixes: f7c913438533 ("io_uring/rw: allow pollable non-blocking attempts for !FMODE_NOWAIT")
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 80ae3c2ebb70..354c4e175654 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -807,7 +807,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
          * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
          */
         if (kiocb->ki_flags & IOCB_NOWAIT ||
-            ((file->f_flags & O_NONBLOCK && (req->flags & REQ_F_SUPPORT_NOWAIT))))
+            ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
                 req->flags |= REQ_F_NOWAIT;

         if (ctx->flags & IORING_SETUP_IOPOLL) {
--
cgit v1.2.3


From 1d60d74e852647255bd8e76f5a22dc42531e4389 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 31 Oct 2024 08:05:44 -0600
Subject: io_uring/rw: fix missing NOWAIT check for O_DIRECT start write

When io_uring starts a write, it'll call kiocb_start_write() to bump the
super block rwsem, preventing any freezes from happening while that
write is in-flight. The freeze side will grab that rwsem for writing,
excluding any new writers and waiting for existing writes to finish. But
io_uring unconditionally uses kiocb_start_write(), which will block if
someone is currently attempting to freeze the mount point. This causes a
deadlock where freeze is waiting for previous writes to complete, but
the previous writes cannot complete, as the task that is supposed to
complete them is blocked waiting on starting a new write. This results
in the following stuck trace, showing that dependency with the write
blocked starting a new write:

task:fio             state:D stack:0     pid:886   tgid:886   ppid:876
Call trace:
 __switch_to+0x1d8/0x348
 __schedule+0x8e8/0x2248
 schedule+0x110/0x3f0
 percpu_rwsem_wait+0x1e8/0x3f8
 __percpu_down_read+0xe8/0x500
 io_write+0xbb8/0xff8
 io_issue_sqe+0x10c/0x1020
 io_submit_sqes+0x614/0x2110
 __arm64_sys_io_uring_enter+0x524/0x1038
 invoke_syscall+0x74/0x268
 el0_svc_common.constprop.0+0x160/0x238
 do_el0_svc+0x44/0x60
 el0_svc+0x44/0xb0
 el0t_64_sync_handler+0x118/0x128
 el0t_64_sync+0x168/0x170
INFO: task fsfreeze:7364 blocked for more than 15 seconds.
      Not tainted 6.12.0-rc5-00063-g76aaf945701c #7963

with the attempting freezer stuck trying to grab the rwsem:

task:fsfreeze        state:D stack:0     pid:7364  tgid:7364  ppid:995
Call trace:
 __switch_to+0x1d8/0x348
 __schedule+0x8e8/0x2248
 schedule+0x110/0x3f0
 percpu_down_write+0x2b0/0x680
 freeze_super+0x248/0x8a8
 do_vfs_ioctl+0x149c/0x1b18
 __arm64_sys_ioctl+0xd0/0x1a0
 invoke_syscall+0x74/0x268
 el0_svc_common.constprop.0+0x160/0x238
 do_el0_svc+0x44/0x60
 el0_svc+0x44/0xb0
 el0t_64_sync_handler+0x118/0x128
 el0t_64_sync+0x168/0x170

Fix this by having the io_uring side honor IOCB_NOWAIT, and only attempt
a blocking grab of the super block rwsem if it isn't set. For normal
issue where IOCB_NOWAIT would always be set, this returns -EAGAIN which
will have io_uring core issue a blocking attempt of the write. That will
in turn also get completions run, ensuring forward progress.

Since freezing requires CAP_SYS_ADMIN in the first place, this isn't
something that can be triggered by a regular user.
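The interaction can be pictured with the rough userspace sketch below: one
thread keeps O_DIRECT writes in flight over io_uring while another issues
FIFREEZE/FITHAW on the mount. This is only an illustration of the scenario,
not a guaranteed reproducer and not part of the patch; the /mnt/test paths,
sizes and loop count are made-up example values, and the freeze side needs
CAP_SYS_ADMIN.

/* sketch: io_uring O_DIRECT writes racing a filesystem freeze (assumptions above) */
#define _GNU_SOURCE
#include <fcntl.h>
#include <liburing.h>
#include <linux/fs.h>
#include <pthread.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

static void *freezer(void *arg)
{
        int mntfd = *(int *)arg;

        /* requires CAP_SYS_ADMIN; freeze the mount, then thaw it again */
        ioctl(mntfd, FIFREEZE, 0);
        ioctl(mntfd, FITHAW, 0);
        return NULL;
}

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        pthread_t thr;
        void *buf;
        int fd, mntfd, i;

        fd = open("/mnt/test/file", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        mntfd = open("/mnt/test", O_RDONLY | O_DIRECTORY);
        if (fd < 0 || mntfd < 0 || posix_memalign(&buf, 4096, 4096))
                return 1;
        if (io_uring_queue_init(8, &ring, 0) < 0)
                return 1;

        pthread_create(&thr, NULL, freezer, &mntfd);

        /* keep writes going while the freeze is attempted */
        for (i = 0; i < 1024; i++) {
                sqe = io_uring_get_sqe(&ring);
                io_uring_prep_write(sqe, fd, buf, 4096, i * 4096ULL);
                io_uring_submit(&ring);
                io_uring_wait_cqe(&ring, &cqe);
                io_uring_cqe_seen(&ring, cqe);
        }

        pthread_join(thr, NULL);
        io_uring_queue_exit(&ring);
        return 0;
}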
Cc: stable@vger.kernel.org # 5.10+
Reported-by: Peter Mann
Link: https://lore.kernel.org/io-uring/38c94aec-81c9-4f62-b44e-1d87f5597644@sh.cz
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'io_uring')

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 354c4e175654..155938f10093 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1014,6 +1014,25 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
         return IOU_OK;
 }

+static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
+{
+        struct inode *inode;
+        bool ret;
+
+        if (!(req->flags & REQ_F_ISREG))
+                return true;
+        if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
+                kiocb_start_write(kiocb);
+                return true;
+        }
+
+        inode = file_inode(kiocb->ki_filp);
+        ret = sb_start_write_trylock(inode->i_sb);
+        if (ret)
+                __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+        return ret;
+}
+
 int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -1051,8 +1070,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
         if (unlikely(ret))
                 return ret;

-        if (req->flags & REQ_F_ISREG)
-                kiocb_start_write(kiocb);
+        if (unlikely(!io_kiocb_start_write(req, kiocb)))
+                return -EAGAIN;
         kiocb->ki_flags |= IOCB_WRITE;

         if (likely(req->file->f_op->write_iter))
--
cgit v1.2.3
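For completeness, the userspace side of the SQPOLL contract addressed by
commit 28aabffae6be ("io_uring/sqpoll: close race on waiting for sqring
entries") above looks roughly like the liburing sketch below. It is an
illustrative example, not part of any patch; error handling is trimmed and
the sq_thread_idle value is an arbitrary choice.

/* sketch: get-sqe/sqring-wait loop for an SQPOLL ring (assumptions above) */
#include <liburing.h>
#include <string.h>

/*
 * If the SQ ring is full, io_uring_sqring_wait() issues
 * io_uring_enter(IORING_ENTER_SQ_WAIT); once it returns, a new SQE is
 * expected to be available again.
 */
static struct io_uring_sqe *get_sqe_blocking(struct io_uring *ring)
{
        struct io_uring_sqe *sqe;

        while (!(sqe = io_uring_get_sqe(ring))) {
                /* SQ ring full: wait for the SQPOLL thread to consume entries */
                if (io_uring_sqring_wait(ring) < 0)
                        return NULL;
        }
        return sqe;
}

int main(void)
{
        struct io_uring_params p;
        struct io_uring ring;
        struct io_uring_sqe *sqe;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_SQPOLL;
        p.sq_thread_idle = 2000;        /* example idle timeout, in msec */
        if (io_uring_queue_init_params(8, &ring, &p))
                return 1;

        sqe = get_sqe_blocking(&ring);
        if (sqe)
                io_uring_prep_nop(sqe);
        io_uring_submit(&ring);
        io_uring_queue_exit(&ring);
        return 0;
}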