From d8aeb44a9ae324c4b823689fabb30b6621d93c88 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Mar 2023 09:40:28 -0700 Subject: fs: add FMODE_DIO_PARALLEL_WRITE flag Some filesystems support multiple threads writing to the same file with O_DIRECT without requiring exclusive access to it. io_uring can use this hint to avoid serializing dio writes to this inode, instead allowing them to run in parallel. XFS and ext4 both fall into this category, so set the flag for both of them. Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c85916e9f7db..475d88640d3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)0x800000) +/* File supports non-exclusive O_DIRECT writes from multiple threads */ +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -- cgit v1.2.3 From efba1a9e653e107577a48157b5424878c46f2285 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:52 -0800 Subject: io_uring: Move from hlist to io_wq_work_node Having cache entries linked using the hlist format brings no benefit, and also requires an unnecessary extra pointer address per cache entry. Use the internal io_wq_work_node single-linked list for the internal alloc caches (async_msghdr and async_poll) This is required to be able to use KASAN on cache entries, since we do not need to touch unused (and poisoned) cache entries when adding more entries to the list. Suggested-by: Pavel Begunkov Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00689c12f6ab..757105ebd8af 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -188,7 +188,7 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct hlist_head list; + struct io_wq_work_node list; unsigned int nr_cached; }; -- cgit v1.2.3 From e1fe7ee885dc0712e982ee465d9f8b96254c30c1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:53 -0800 Subject: io_uring: Add KASAN support for alloc_caches Add support for KASAN in the alloc_caches (apoll and netmsg_cache). Thus, if something touches the unused caches, it will raise a KASAN warning/exception. It poisons the object when the object is put to the cache, and unpoisons it when the object is gotten or freed. Signed-off-by: Breno Leitao Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 757105ebd8af..3d152bdcd30a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -190,6 +190,7 @@ struct io_ev_fd { struct io_alloc_cache { struct io_wq_work_node list; unsigned int nr_cached; + size_t elem_size; }; struct io_ring_ctx { -- cgit v1.2.3 From a282967c848fb1d92c28334430c472da9c334e54 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:15 +0100 Subject: io_uring: encapsulate task_work state For task works we're passing around a bool pointer for whether the current ring is locked or not, let's wrap it in a structure, that will make it more opaque preventing abuse and will also help us to pass more info in the future if needed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3d152bdcd30a..561fa421c453 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -367,6 +367,11 @@ struct io_ring_ctx { unsigned evfd_last_cq_tail; }; +struct io_tw_state { + /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ + bool locked; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -473,7 +478,7 @@ enum { REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); struct io_task_work { struct llist_node node; -- cgit v1.2.3 From 8e15c0e71b8ae64fb7163532860f8d608165281f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:46 +0100 Subject: io_uring/rsrc: keep cached refs per node We cache refs of the current node (i.e. ctx->rsrc_node) in ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the cached refs in struct io_rsrc_node for now. It's a prep patch and shouldn't change anything in practise. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9edc3669c1d71b06c2dca78b2b2b8bb9292738b9.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 561fa421c453..a0a5b5964d3a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -240,7 +240,6 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; -- cgit v1.2.3 From 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:50 +0100 Subject: io_uring/rsrc: kill rsrc_ref_lock We use ->rsrc_ref_lock spinlock to protect ->rsrc_ref_list in io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means io_rsrc_node_ref_zero() is not executed from the irq context as an RCU callback anymore, and we also put it under ->uring_lock. io_rsrc_node_switch(), which queues up nodes into the list, is also protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b60af883c263551190b526a55ff2c9d5ae07141.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index a0a5b5964d3a..9492889f00c0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -333,8 +333,8 @@ struct io_ring_ctx { struct delayed_work rsrc_put_work; struct callback_head rsrc_put_tw; struct llist_head rsrc_put_llist; + /* protected by ->uring_lock */ struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; struct list_head io_buffers_pages; -- cgit v1.2.3 From 36b9818a5a84cb7c977fb723babca1c8d74f288f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:53 +0100 Subject: io_uring/rsrc: don't offload node free struct delayed_work rsrc_put_work was previously used to offload node freeing because io_rsrc_node_ref_zero() was previously called by RCU in the IRQ context. Now, as percpu refcounting is gone, we can do it eagerly at the spot without pushing it to a worker. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/13fb1aac1e8d068ad8fd4a0c6d0d157ab61b90c0.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9492889f00c0..47496059e13a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -330,9 +330,6 @@ struct io_ring_ctx { struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; - struct delayed_work rsrc_put_work; - struct callback_head rsrc_put_tw; - struct llist_head rsrc_put_llist; /* protected by ->uring_lock */ struct list_head rsrc_ref_list; -- cgit v1.2.3 From 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:54 +0100 Subject: io_uring/rsrc: cache struct io_rsrc_node Add allocation cache for struct io_rsrc_node, it's always allocated and put under ->uring_lock, so it doesn't need any extra synchronisation around caches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/252a9d9ef9654e6467af30fdc02f57c0118fb76e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 47496059e13a..5d772e36e7fc 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -332,6 +332,7 @@ struct io_ring_ctx { /* protected by ->uring_lock */ struct list_head rsrc_ref_list; + struct io_alloc_cache rsrc_node_cache; struct list_head io_buffers_pages; -- cgit v1.2.3 From 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:57 +0100 Subject: io_uring/rsrc: add custom limit for node caching The number of entries in the rsrc node cache is limited to 512, which still seems unnecessarily large. Add per cache thresholds and set to to 32 for the rsrc node cache. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0cd538b944dac0bf878e276fc0199f21e6bccea.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5d772e36e7fc..4a6ce03a4903 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -190,6 +190,7 @@ struct io_ev_fd { struct io_alloc_cache { struct io_wq_work_node list; unsigned int nr_cached; + unsigned int max_cached; size_t elem_size; }; -- cgit v1.2.3 From 8751d15426a31baaf40f7570263c27c3e5d1dc44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:12 +0100 Subject: io_uring: reduce scheduling due to tw Every task_work will try to wake the task to be executed, which causes excessive scheduling and additional overhead. For some tw it's justified, but others won't do much but post a single CQE. When a task waits for multiple cqes, every such task_work will wake it up. Instead, the task may give a hint about how many cqes it waits for, io_req_local_work_add() will compare against it and skip wake ups if #cqes + #tw is not enough to satisfy the waiting condition. Task_work that uses the optimisation should be simple enough and never post more than one CQE. It's also ignored for non DEFER_TASKRUN rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4a6ce03a4903..fa621a508a01 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -296,7 +296,7 @@ struct io_ring_ctx { spinlock_t completion_lock; bool poll_multi_queue; - bool cq_waiting; + atomic_t cq_wait_nr; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -566,6 +566,7 @@ struct io_kiocb { atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; + unsigned nr_tw; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ union { struct hlist_node hash_node; -- cgit v1.2.3 From 528407b1e0ea51260fff2cc8b669c632a65d7a09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:05 +0100 Subject: io_uring/rsrc: consolidate node caching We store one pre-allocated rsrc node in ->rsrc_backup_node, merge it with ->rsrc_node_cache. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6d5410e51ccd29be7a716be045b51d6b371baef6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fa621a508a01..40cab420b1bd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -326,7 +326,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_rsrc_node *rsrc_backup_node; struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; -- cgit v1.2.3 From 4ea15b56f0810f0d8795d475db1bb74b3a7c1b2f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:08 +0100 Subject: io_uring/rsrc: use wq for quiescing Replace completions with waitqueues for rsrc data quiesce, the main wakeup condition is when data refs hit zero. Note that data refs are only changes under ->uring_lock, so we prepare before mutex_unlock() reacquire it after taking the lock back. This change will be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d0dbc74b3b4fd67c8f01819e680c5e0da252956.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 40cab420b1bd..5c9645319770 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -333,6 +333,7 @@ struct io_ring_ctx { /* protected by ->uring_lock */ struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; + struct wait_queue_head rsrc_quiesce_wq; struct list_head io_buffers_pages; -- cgit v1.2.3 From 0b222eeb6514ba6c3457b667fa4f3645032e1fc9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:10 +0100 Subject: io_uring/rsrc: remove rsrc_data refs Instead of waiting for rsrc_data->refs to be downed to zero, check whether there are rsrc nodes queued for completion, that's easier then maintaining references. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e33fd143d83e11af3e386aea28eb6d6c6a1be10.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5c9645319770..1b2a20a42413 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -334,6 +334,7 @@ struct io_ring_ctx { struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; struct wait_queue_head rsrc_quiesce_wq; + unsigned rsrc_quiesce; struct list_head io_buffers_pages; -- cgit v1.2.3