From bdf200731145f07a6127cb16753e2e8fdc159cf4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Tue, 1 Oct 2019 09:53:29 -0600
Subject: io_uring: use __kernel_timespec in timeout ABI

All system calls use struct __kernel_timespec instead of the old struct
timespec, but this one was just added with the old-style ABI. Change it
now to enforce the use of __kernel_timespec, avoiding ABI confusion and
the need for compat handlers on 32-bit architectures.

Any user space caller will have to use __kernel_timespec now, but this
is unambiguous and works for any C library regardless of the time_t
definition. A nicer way to specify the timeout would have been a less
ambiguous 64-bit nanosecond value, but I suppose it's too late now to
change that as this would impact both 32-bit and 64-bit users.

Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support")
Signed-off-by: Arnd Bergmann
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dd094b387cab..0bc167aca46d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1892,15 +1892,15 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         unsigned count, req_dist, tail_index;
         struct io_ring_ctx *ctx = req->ctx;
         struct list_head *entry;
-        struct timespec ts;
+        struct timespec64 ts;
 
         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
             sqe->len != 1)
                 return -EINVAL;
-        if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr,
-            sizeof(ts)))
+
+        if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
                 return -EFAULT;
 
         /*
@@ -1934,7 +1934,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
         hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         req->timeout.timer.function = io_timeout_fn;
-        hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts),
+        hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
                         HRTIMER_MODE_REL);
         return 0;
 }
--
cgit v1.2.3
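For context, this is what a submission looks like from user space after the change: the caller always fills in a struct __kernel_timespec, whose 64-bit fields are the same on 32-bit and 64-bit architectures. The snippet below is only an illustrative sketch and assumes the liburing helpers (io_uring_prep_timeout() and friends); the raw ABI merely requires sqe->addr to point at a __kernel_timespec and sqe->len to be 1.

/* Illustrative user space sketch, not part of the patch; assumes liburing. */
#include <liburing.h>

int queue_timeout(struct io_uring *ring)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        /* 64-bit seconds/nanoseconds regardless of the libc time_t width */
        struct __kernel_timespec ts = {
                .tv_sec  = 1,
                .tv_nsec = 500000000,
        };

        if (!sqe)
                return -EBUSY;
        /* completes when one CQE is posted or after 1.5s, whichever comes first */
        io_uring_prep_timeout(sqe, &ts, 1, 0);
        return io_uring_submit(ring);
}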
From bf7ec93c644cb0064ba7d2fc40d4841c5ba382ab Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 4 Oct 2019 17:01:08 +0300
Subject: io_uring: fix reversed nonblock flag for link submission

io_queue_link_head() accepts a @force_nonblock flag, but io_ring_submit()
passes the opposite.

Fixes: c576666863b78 ("io_uring: optimize submit_and_wait API")
Reported-by: kbuild test robot
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0bc167aca46d..ab8c4e5e442c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2761,7 +2761,7 @@ out:
 
         if (link)
                 io_queue_link_head(ctx, link, &link->submit, shadow_req,
                                         block_for_last);
+                                        !block_for_last);
         if (statep)
                 io_submit_state_end(statep);
 
--
cgit v1.2.3

From 6805b32ec2b0897eb180295385efe306e5ac3b3d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 8 Oct 2019 02:18:42 +0300
Subject: io_uring: remove wait loop spurious wakeups

Any changes interesting to tasks waiting in io_cqring_wait() are
committed with io_cqring_ev_posted(). However, io_ring_drop_ctx_refs()
also tries to do that, for no reason, which means spurious wakeups on
every io_free_req() and io_uring_enter(). Just use percpu_ref_put()
instead.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ab8c4e5e442c..ceb3497bdd2a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -591,14 +591,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
         io_cqring_ev_posted(ctx);
 }
 
-static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
-{
-        percpu_ref_put_many(&ctx->refs, refs);
-
-        if (waitqueue_active(&ctx->wait))
-                wake_up(&ctx->wait);
-}
-
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
                                    struct io_submit_state *state)
 {
@@ -646,7 +638,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
         req->result = 0;
         return req;
 out:
-        io_ring_drop_ctx_refs(ctx, 1);
+        percpu_ref_put(&ctx->refs);
         return NULL;
 }
 
@@ -654,7 +646,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 {
         if (*nr) {
                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
-                io_ring_drop_ctx_refs(ctx, *nr);
+                percpu_ref_put_many(&ctx->refs, *nr);
                 *nr = 0;
         }
 }
@@ -663,7 +655,7 @@ static void __io_free_req(struct io_kiocb *req)
 {
         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
                 fput(req->file);
-        io_ring_drop_ctx_refs(req->ctx, 1);
+        percpu_ref_put(&req->ctx->refs);
         kmem_cache_free(req_cachep, req);
 }
 
@@ -3584,7 +3576,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                 }
         }
 
-        io_ring_drop_ctx_refs(ctx, 1);
+        percpu_ref_put(&ctx->refs);
 out_fput:
         fdput(f);
         return submitted ? submitted : ret;
--
cgit v1.2.3
From 8a99734081775c012a4a6c442fdef0379fe52bdf Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 9 Oct 2019 14:40:13 -0600
Subject: io_uring: only flush workqueues on fileset removal

We should not remove the workqueue; we just need to ensure that the
workqueues are synced. The workqueues are torn down on ctx removal.

Cc: stable@vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-by: Stefan Hajnoczi
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ceb3497bdd2a..2c44648217bd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2866,8 +2866,12 @@ static void io_finish_async(struct io_ring_ctx *ctx)
 static void io_destruct_skb(struct sk_buff *skb)
 {
         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+        int i;
+
+        for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
+                if (ctx->sqo_wq[i])
+                        flush_workqueue(ctx->sqo_wq[i]);
 
-        io_finish_async(ctx);
         unix_destruct_scm(skb);
 }
 
--
cgit v1.2.3

From 7adf4eaf60f3d8c3584bed51fe7066d4dfc2cbe1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 10 Oct 2019 21:42:58 -0600
Subject: io_uring: fix sequence logic for timeout requests

We have two ways a request can be deferred:

1) It's a regular request that depends on another one
2) It's a timeout that tracks completions

We have a shared helper to determine whether to defer, and that attempts
to make the right decision based on the request. But we only have some
of this information in the caller. Un-share the two timeout/defer
helpers so the caller can use the right one.

Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support")
Reported-by: yangerkun
Reviewed-by: Jackie Liu
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c44648217bd..38d274fc0f25 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -415,27 +415,27 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         return ctx;
 }
 
+static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
+                                       struct io_kiocb *req)
+{
+        return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
+}
+
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
                                      struct io_kiocb *req)
 {
-        /* timeout requests always honor sequence */
-        if (!(req->flags & REQ_F_TIMEOUT) &&
-            (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+        if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
                 return false;
 
-        return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
+        return __io_sequence_defer(ctx, req);
 }
 
-static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
-                                              struct list_head *list)
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 {
         struct io_kiocb *req;
 
-        if (list_empty(list))
-                return NULL;
-
-        req = list_first_entry(list, struct io_kiocb, list);
-        if (!io_sequence_defer(ctx, req)) {
+        req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
+        if (req && !io_sequence_defer(ctx, req)) {
                 list_del_init(&req->list);
                 return req;
         }
@@ -443,14 +443,17 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
         return NULL;
 }
 
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-{
-        return __io_get_deferred_req(ctx, &ctx->defer_list);
-}
-
 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 {
-        return __io_get_deferred_req(ctx, &ctx->timeout_list);
+        struct io_kiocb *req;
+
+        req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
+        if (req && !__io_sequence_defer(ctx, req)) {
+                list_del_init(&req->list);
+                return req;
+        }
+
+        return NULL;
 }
 
 static void __io_commit_cqring(struct io_ring_ctx *ctx)
--
cgit v1.2.3
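To make the shared sequence check concrete: a request stays deferred until the CQ tail (plus dropped SQEs) has caught up with the sequence the request was assigned at submission. A small self-contained sketch in plain C with made-up numbers, mirroring the predicate rather than reusing kernel code:

#include <assert.h>
#include <stdbool.h>

/* Mirrors __io_sequence_defer(): defer until the CQ has caught up with the
 * sequence assigned at submission.  Unsigned math so wraparound behaves
 * like the kernel's u32 arithmetic. */
static bool sequence_defer(unsigned req_sequence,
                           unsigned cached_cq_tail, unsigned sq_dropped)
{
        return req_sequence != cached_cq_tail + sq_dropped;
}

int main(void)
{
        /* Hypothetical request assigned sequence 12 */
        assert(sequence_defer(12, 10, 0));      /* two completions still outstanding */
        assert(!sequence_defer(12, 12, 0));     /* caught up, run it now */
        return 0;
}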
From 5da0fb1ab34ccfe6d49210b4f5a739c59fcbf25e Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Tue, 15 Oct 2019 21:59:29 +0800
Subject: io_uring: consider the overflow of sequence for timeout req

We currently compute the sequence of a timeout request as
'req->sequence = ctx->cached_sq_head + count - 1' and find the right
place to insert it into timeout_list by comparing how many completions
each entry is still waiting for. But this does not account for
overflow:

1. ctx->cached_sq_head + count - 1 may overflow, so a new timeout req
   with a bigger count can end up with a smaller req->sequence.

2. The current cached_sq_head may have wrapped around relative to an
   earlier request, which likewise gives the new timeout req a smaller
   req->sequence.

Either case misorders timeout_list, which in turn makes the timeouts
complete in the wrong order. Fix it by reusing req->submit.sequence to
store the count, and changing the insertion-sort logic in io_timeout().

Signed-off-by: yangerkun
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 38d274fc0f25..d2cb277da2f4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1884,7 +1884,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 
 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-        unsigned count, req_dist, tail_index;
+        unsigned count;
         struct io_ring_ctx *ctx = req->ctx;
         struct list_head *entry;
         struct timespec64 ts;
@@ -1907,21 +1907,36 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 count = 1;
 
         req->sequence = ctx->cached_sq_head + count - 1;
+        /* reuse it to store the count */
+        req->submit.sequence = count;
         req->flags |= REQ_F_TIMEOUT;
 
         /*
          * Insertion sort, ensuring the first entry in the list is always
          * the one we need first.
          */
-        tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
-        req_dist = req->sequence - tail_index;
         spin_lock_irq(&ctx->completion_lock);
         list_for_each_prev(entry, &ctx->timeout_list) {
                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-                unsigned dist;
+                unsigned nxt_sq_head;
+                long long tmp, tmp_nxt;
 
-                dist = nxt->sequence - tail_index;
-                if (req_dist >= dist)
+                /*
+                 * Since cached_sq_head + count - 1 can overflow, use type long
+                 * long to store it.
+                 */
+                tmp = (long long)ctx->cached_sq_head + count - 1;
+                nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
+                tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+
+                /*
+                 * cached_sq_head may overflow, and it will never overflow twice
+                 * once there is some timeout req still be valid.
+                 */
+                if (ctx->cached_sq_head < nxt_sq_head)
+                        tmp_nxt += UINT_MAX;
+
+                if (tmp >= tmp_nxt)
                         break;
         }
         list_add(&req->list, entry);
--
cgit v1.2.3
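The wraparound in point 1 is easy to reproduce in isolation. A tiny self-contained C sketch (hypothetical numbers, not kernel code) showing how the plain u32 sequence makes a later-expiring timeout look earlier:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical: the SQ head is almost at the u32 limit when two
         * timeout requests are queued. */
        unsigned cached_sq_head = UINT_MAX - 1;
        unsigned seq_a = cached_sq_head + 10 - 1;       /* wraps to a tiny value */
        unsigned seq_b = cached_sq_head + 2 - 1;        /* stays huge */

        /* seq_a < seq_b, so a naive comparison sorts the 10-completion
         * timeout before the 2-completion one, exactly the misordering
         * the patch describes. */
        printf("seq_a=%u seq_b=%u\n", seq_a, seq_b);
        return 0;
}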
From 491381ce07ca57f68c49c79a8a43da5b60749e32 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 17 Oct 2019 09:20:46 -0600
Subject: io_uring: fix up O_NONBLOCK handling for sockets

We've got two issues with the non-regular file handling for non-blocking
IO:

1) We don't want to re-do a short read in full for a non-regular file,
   as we can't just read the data again.

2) For non-regular files that don't support non-blocking IO attempts,
   we need to punt to async context even if the file is opened as
   non-blocking. Otherwise the caller always gets -EAGAIN.

Add two new request flags to handle these cases. One is just a cache of
the inode S_ISREG() status, the other tells io_uring that we always need
to punt this request to async context, even if REQ_F_NOWAIT is set.

Cc: stable@vger.kernel.org
Reported-by: Hrvoje Zeba
Tested-by: Hrvoje Zeba
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 57 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 18 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2cb277da2f4..b7d4085d6ffd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -322,6 +322,8 @@ struct io_kiocb {
 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 #define REQ_F_SHADOW_DRAIN      512     /* link-drain shadow req */
 #define REQ_F_TIMEOUT           1024    /* timeout request */
+#define REQ_F_ISREG             2048    /* regular file */
+#define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
         u64                     user_data;
         u32                     result;
         u32                     sequence;
@@ -914,26 +916,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
         return ret;
 }
 
-static void kiocb_end_write(struct kiocb *kiocb)
+static void kiocb_end_write(struct io_kiocb *req)
 {
-        if (kiocb->ki_flags & IOCB_WRITE) {
-                struct inode *inode = file_inode(kiocb->ki_filp);
+        /*
+         * Tell lockdep we inherited freeze protection from submission
+         * thread.
+         */
+        if (req->flags & REQ_F_ISREG) {
+                struct inode *inode = file_inode(req->file);
 
-                /*
-                 * Tell lockdep we inherited freeze protection from submission
-                 * thread.
-                 */
-                if (S_ISREG(inode->i_mode))
-                        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
-                file_end_write(kiocb->ki_filp);
+                __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
         }
+        file_end_write(req->file);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
-        kiocb_end_write(kiocb);
+        if (kiocb->ki_flags & IOCB_WRITE)
+                kiocb_end_write(req);
 
         if ((req->flags & REQ_F_LINK) && res != req->result)
                 req->flags |= REQ_F_FAIL_LINK;
@@ -945,7 +947,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
-        kiocb_end_write(kiocb);
+        if (kiocb->ki_flags & IOCB_WRITE)
+                kiocb_end_write(req);
 
         if ((req->flags & REQ_F_LINK) && res != req->result)
                 req->flags |= REQ_F_FAIL_LINK;
@@ -1059,8 +1062,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
         if (!req->file)
                 return -EBADF;
 
-        if (force_nonblock && !io_file_supports_async(req->file))
-                force_nonblock = false;
+        if (S_ISREG(file_inode(req->file)->i_mode))
+                req->flags |= REQ_F_ISREG;
+
+        /*
+         * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+         * we know to async punt it even if it was opened O_NONBLOCK
+         */
+        if (force_nonblock && !io_file_supports_async(req->file)) {
+                req->flags |= REQ_F_MUST_PUNT;
+                return -EAGAIN;
+        }
 
         kiocb->ki_pos = READ_ONCE(sqe->off);
         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -1081,7 +1093,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
                 return ret;
 
         /* don't allow async punt if RWF_NOWAIT was requested */
-        if (kiocb->ki_flags & IOCB_NOWAIT)
+        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+            (req->file->f_flags & O_NONBLOCK))
                 req->flags |= REQ_F_NOWAIT;
 
         if (force_nonblock)
@@ -1382,7 +1395,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
                  * need async punt anyway, so it's more efficient to do it
                  * here.
                  */
-                if (force_nonblock && ret2 > 0 && ret2 < read_size)
+                if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
+                    (req->flags & REQ_F_ISREG) &&
+                    ret2 > 0 && ret2 < read_size)
                         ret2 = -EAGAIN;
                 /* Catch -EAGAIN return for forced non-blocking submission */
                 if (!force_nonblock || ret2 != -EAGAIN) {
@@ -1447,7 +1462,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
                  * released so that it doesn't complain about the held lock when
                  * we return to userspace.
                  */
-                if (S_ISREG(file_inode(file)->i_mode)) {
+                if (req->flags & REQ_F_ISREG) {
                         __sb_start_write(file_inode(file)->i_sb,
                                                 SB_FREEZE_WRITE, true);
                         __sb_writers_release(file_inode(file)->i_sb,
@@ -2282,7 +2297,13 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
         int ret;
 
         ret = __io_submit_sqe(ctx, req, s, force_nonblock);
-        if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+
+        /*
+         * We async punt it if the file wasn't marked NOWAIT, or if the file
+         * doesn't support non-blocking read/write attempts
+         */
+        if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
+            (req->flags & REQ_F_MUST_PUNT))) {
                 struct io_uring_sqe *sqe_copy;
 
                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
--
cgit v1.2.3
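As a user-space consequence of point 2: per the commit message, a read queued against a socket that has O_NONBLOCK set would otherwise always complete with -EAGAIN instead of being punted to async context. A hedged sketch of that pattern follows; it assumes liburing and an already-connected socket fd, and is illustrative rather than part of the patch.

#include <fcntl.h>
#include <liburing.h>
#include <sys/uio.h>

/* Read from an O_NONBLOCK socket through io_uring.  With this fix the
 * request can be punted to async context, so the CQE carries real data
 * (or a real error) instead of -EAGAIN. */
int read_nonblock_socket(int sock_fd, char *buf, unsigned len)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        int ret;

        fcntl(sock_fd, F_SETFL, fcntl(sock_fd, F_GETFL) | O_NONBLOCK);

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0)
                return ret;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_readv(sqe, sock_fd, &iov, 1, 0);
        io_uring_submit(&ring);

        /* Waits for the actual completion; the socket's O_NONBLOCK flag no
         * longer surfaces as an immediate -EAGAIN completion. */
        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret) {
                ret = cqe->res;
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return ret;
}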
From 8b07a65ad30e5612d9590fb50468ff4fa314cfc7 Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Thu, 17 Oct 2019 12:12:35 +0800
Subject: io_uring: fix logic error in io_timeout

If ctx->cached_sq_head < nxt_sq_head, we should add UINT_MAX to tmp,
not tmp_nxt.

Fixes: 5da0fb1ab34c ("io_uring: consider the overflow of sequence for timeout req")
Signed-off-by: yangerkun
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b7d4085d6ffd..1d03afd74368 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1949,7 +1949,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                  * once there is some timeout req still be valid.
                  */
                 if (ctx->cached_sq_head < nxt_sq_head)
-                        tmp_nxt += UINT_MAX;
+                        tmp += UINT_MAX;
 
                 if (tmp >= tmp_nxt)
                         break;
--
cgit v1.2.3
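With this correction in place, the overflow-aware comparison built up by the two timeout patches can be exercised on its own. A self-contained sketch in plain C (illustrative, hypothetical numbers; it mirrors the kernel logic rather than reusing it):

#include <assert.h>
#include <limits.h>
#include <stdbool.h>

/* Mirrors the fixed insertion-sort comparison: returns true if a new
 * timeout, queued at sq head 'head' and waiting for 'count' completions,
 * should be placed after the existing entry 'nxt'. */
static bool new_after_nxt(unsigned head, unsigned count,
                          unsigned nxt_sequence, unsigned nxt_count)
{
        unsigned nxt_sq_head = nxt_sequence - nxt_count + 1;
        long long tmp = (long long)head + count - 1;
        long long tmp_nxt = (long long)nxt_sq_head + nxt_count - 1;

        /* head wrapped past UINT_MAX since nxt was queued: bias tmp, not
         * tmp_nxt (the logic error this patch fixes). */
        if (head < nxt_sq_head)
                tmp += UINT_MAX;

        return tmp >= tmp_nxt;
}

int main(void)
{
        /* Hypothetical numbers: nxt was queued at sq head UINT_MAX - 1,
         * waiting for 4 completions, so its stored u32 sequence wrapped.
         * The new timeout is queued after the head wrapped to 2 and waits
         * for 1 completion; it expires later, so it must sort after nxt. */
        assert(new_after_nxt(2, 1, UINT_MAX - 1 + 4 - 1, 4));
        return 0;
}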