From ab6005f3912fff07330297aba08922d2456dcede Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Apr 2025 15:46:34 +0100 Subject: io_uring: don't post tag CQEs on file/buffer registration failure Buffer / file table registration is all or nothing, if it fails all resources we might have partially registered are dropped and the table is killed. If that happens, it doesn't make sense to post any rsrc tag CQEs. That would be confusing to the application, which should not need to handle that case. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Fixes: 7029acd8a9503 ("io_uring/rsrc: get rid of per-ring io_rsrc_node list") Link: https://lore.kernel.org/r/c514446a8dcb0197cddd5d4ba8f6511da081cf1f.1743777957.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5e64a8bb30a4..b36c8825550e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -175,6 +175,18 @@ void io_rsrc_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->imu_cache, kfree); } +static void io_clear_table_tags(struct io_rsrc_data *data) +{ + int i; + + for (i = 0; i < data->nr; i++) { + struct io_rsrc_node *node = data->nodes[i]; + + if (node) + node->tag = 0; + } +} + __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { @@ -583,6 +595,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: + io_clear_table_tags(&ctx->file_table.data); io_sqe_files_unregister(ctx); return ret; } @@ -902,8 +915,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } ctx->buf_table = data; - if (ret) + if (ret) { + io_clear_table_tags(&ctx->buf_table); io_sqe_buffers_unregister(ctx); + } return ret; } -- cgit v1.2.3 From 9b58440a5b2fe78102ce1e9e03946645558d0f55 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 5 Apr 2025 11:17:49 +0100 Subject: io_uring/zcrx: put refill data into separate cache line Refill queue lock and other bits are only used from the allocation path on the rx softirq side, but it shares the cache line with other fields like ctx that are used also in the "syscall" path, which causes cache bouncing when softirq runs on a different CPU. Separate them into different cache lines. The first one now contains constant fields used by both contextx, followed by a line responsible for refill queue data. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6d1f598e27d623c07fc49d6baee13089a9b1216c.1743848241.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 706cc7300780..b59c560d5d84 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -26,11 +26,11 @@ struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct io_zcrx_area *area; + spinlock_t rq_lock ____cacheline_aligned_in_smp; struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; - u32 rq_entries; u32 cached_rq_head; - spinlock_t rq_lock; + u32 rq_entries; u32 if_rxq; struct device *dev; -- cgit v1.2.3 From 5a17131a5dbd0ebca655bfb65fe3fe643ccc27f3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 5 Apr 2025 11:18:29 +0100 Subject: io_uring/zcrx: separate niov number from pages A preparation patch that separates the number of pages / folios from the number of niovs. 
They will not match in the future to support huge pages, improved dma mapping and/or larger chunk sizes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0780ac966ee84200385737f45bb0f2ada052392b.1743848231.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 19 ++++++++++--------- io_uring/zcrx.h | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 80d4a6f71d29..0f46e0404c04 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -181,7 +181,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kvfree(area->nia.niovs); kvfree(area->user_refs); if (area->pages) { - unpin_user_pages(area->pages, area->nia.num_niovs); + unpin_user_pages(area->pages, area->nr_folios); kvfree(area->pages); } kfree(area); @@ -192,7 +192,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages; + int i, ret, nr_pages, nr_iovs; struct iovec iov; if (area_reg->flags || area_reg->rq_area_token) @@ -220,27 +220,28 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, area->pages = NULL; goto err; } - area->nia.num_niovs = nr_pages; + area->nr_folios = nr_iovs = nr_pages; + area->nia.num_niovs = nr_iovs; - area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) goto err; - area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), GFP_KERNEL | __GFP_ZERO); if (!area->freelist) goto err; - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_iovs; i++) area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]), + area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) goto err; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_iovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; niov->owner = &area->nia; @@ -248,7 +249,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, atomic_set(&area->user_refs[i], 0); } - area->free_count = nr_pages; + area->free_count = nr_iovs; area->ifq = ifq; /* we're only supporting one area per ifq for now */ area->area_id = 0; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index b59c560d5d84..47f1c0e8c197 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -15,6 +15,7 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; struct page **pages; + unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; -- cgit v1.2.3 From cf960726eb65e8d0bfecbcce6cf95f47b1ffa6cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Apr 2025 07:51:23 -0600 Subject: io_uring/kbuf: reject zero sized provided buffers This isn't fixing a real issue, but there's also zero point in going through group and buffer setup, when the buffers are going to be rejected once attempted to get used. 
Cc: stable@vger.kernel.org Reported-by: syzbot+58928048fd1416f1457c@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 098109259671..953d5e742569 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -504,6 +504,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) -- cgit v1.2.3 From 25744f849524e806a13ade17c4fb83f6888fe954 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 15 Apr 2025 14:09:45 +0100 Subject: io_uring/zcrx: return ifq id to the user IORING_OP_RECV_ZC requests take a zcrx object id via sqe::zcrx_ifq_idx, which binds it to the corresponding if / queue. However, we don't return that id back to the user. It's fine as currently there can be only one zcrx and the user assumes that its id should be 0, but as we'll need multiple zcrx objects in the future let's explicitly pass it back on registration. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8714667d370651962f7d1a169032e5f02682a73e.1744722517.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0f46e0404c04..d0eccf277a20 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -354,7 +354,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)) || + reg.__resv2 || reg.zcrx_id) return -EINVAL; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) return -EINVAL; -- cgit v1.2.3 From 70e4f9bfc13c9abcc97eb9f2feee51cc925524c8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 15 Apr 2025 14:10:16 +0100 Subject: io_uring/zcrx: add pp to ifq conversion helper It'll likely change how page pools store memory providers, so in preparation for that, keep accesses in one place in io_uring by introducing a helper. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3522eb8fa9b4e21bcf32e7e9ae656c616b282210.1744722526.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index d0eccf277a20..5defbe8f95f9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,6 +26,11 @@ #include "zcrx.h" #include "rsrc.h" +static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) +{ + return pp->mp_priv; +} + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, @@ -586,7 +591,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); /* pp should already be ensuring that */ if (unlikely(pp->alloc.count)) @@ -618,7 +623,7 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -637,7 +642,7 @@ static int io_pp_zc_init(struct page_pool *pp) static void io_pp_zc_destroy(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); struct io_zcrx_area *area = ifq->area; if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) @@ -792,7 +797,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, niov = netmem_to_net_iov(frag->netmem); if (niov->pp->mp_ops != &io_uring_pp_zc_ops || - niov->pp->mp_priv != ifq) + io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) -- cgit v1.2.3 From 1ac571288822253db32196c49f240739148417e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Apr 2025 10:32:31 +0100 Subject: io_uring/rsrc: don't skip offset calculation Don't optimise for requests with offset=0. Large registered buffers are the preference and hence the user is likely to pass an offset, and the adjustments are not expensive and will be made even cheaper in following patches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1c2beb20470ee3c886a363d4d8340d3790db19f3.1744882081.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 75 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b36c8825550e..4d62897d1c89 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1036,6 +1036,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { + const struct bio_vec *bvec; size_t offset; int ret; @@ -1054,47 +1055,45 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. 
We can cheat a bit here for user - * registered nodes, because we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be folio_size aligned. + */ + bvec = imu->bvec; - /* - * Kernel buffer bvecs, on the other hand, don't necessarily - * have the size property of user registered ones, so we have - * to use the slow iter advance. - */ - if (offset < bvec->bv_len) { - iter->count -= offset; - iter->iov_offset = offset; - } else if (imu->is_kbuf) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ + if (offset < bvec->bv_len) { + iter->count -= offset; + iter->iov_offset = offset; + } else if (imu->is_kbuf) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); - iter->bvec += seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); - } + iter->bvec += seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } return 0; -- cgit v1.2.3 From 50169d07548441e3033b9bbaa06e573e7224f140 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Apr 2025 10:32:32 +0100 Subject: io_uring/rsrc: separate kbuf offset adjustments Kernel registered buffers are special because segments are not uniform in size, and we have a bunch of optimisations based on that uniformity for normal buffers. Handle kbuf separately, it'll be cleaner this way. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4e9e5990b0ab5aee723c0be5cd9b5bcf810375f9.1744882081.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4d62897d1c89..fddde8ffe81e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1048,11 +1048,14 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, if (!(imu->dir & (1 << ddir))) return -EFAULT; - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. 
- */ offset = buf_addr - imu->ubuf; + + if (imu->is_kbuf) { + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_advance(iter, offset); + return 0; + } + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); /* @@ -1072,17 +1075,9 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, * be folio_size aligned. */ bvec = imu->bvec; - - /* - * Kernel buffer bvecs, on the other hand, don't necessarily - * have the size property of user registered ones, so we have - * to use the slow iter advance. - */ if (offset < bvec->bv_len) { iter->count -= offset; iter->iov_offset = offset; - } else if (imu->is_kbuf) { - iov_iter_advance(iter, offset); } else { unsigned long seg_skip; -- cgit v1.2.3 From 59852ebad954c8a3ac8b746930c2ea60febe797a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Apr 2025 10:32:33 +0100 Subject: io_uring/rsrc: refactor io_import_fixed io_import_fixed is a mess. Even though we know the final len of the iterator, we still assign offset + len and do some magic after to correct for that. Do offset calculation first and finalise it with iov_iter_bvec at the end. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2d5107fed24f8b23245ef2ede9a5a7f7c426df61.1744882081.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fddde8ffe81e..5cf854318b1d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1037,6 +1037,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, u64 buf_addr, size_t len) { const struct bio_vec *bvec; + unsigned nr_segs; size_t offset; int ret; @@ -1056,8 +1057,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates @@ -1067,30 +1066,21 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. */ bvec = imu->bvec; - if (offset < bvec->bv_len) { - iter->count -= offset; - iter->iov_offset = offset; - } else { + if (offset >= bvec->bv_len) { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); - - iter->bvec += seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); + bvec += seg_skip; + offset &= (1UL << imu->folio_shift) - 1; } + nr_segs = imu->nr_bvecs - (bvec - imu->bvec); + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; return 0; } -- cgit v1.2.3 From 80c7378f94cf193cb3bd2101bbcd5aea78d0e211 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Thu, 17 Apr 2025 10:32:34 +0100 Subject: io_uring/rsrc: send exact nr_segs for fixed buffer Sending exact nr_segs, avoids bio split check and processing in block layer, which takes around 5%[1] of overall CPU utilization. 
In our setup, we see overall improvement of IOPS from 7.15M to 7.65M [2] and 5% less CPU utilization. [1] 3.52% io_uring [kernel.kallsyms] [k] bio_split_rw_at 1.42% io_uring [kernel.kallsyms] [k] bio_split_rw 0.62% io_uring [kernel.kallsyms] [k] bio_submit_split [2] sudo taskset -c 0,1 ./t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B1 -n2 -r4 /dev/nvme0n1 /dev/nvme1n1 Signed-off-by: Nitesh Shetty [Pavel: fixed for kbuf, rebased and reworked on top of cleanups] Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7a1a49a8d053bd617c244291d63dbfbc07afde36.1744882081.git.asml.silence@gmail.com [axboe: fold in fix factoring in buf reg offset] Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5cf854318b1d..0c6d7e7415c8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1037,6 +1037,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, u64 buf_addr, size_t len) { const struct bio_vec *bvec; + size_t folio_mask; unsigned nr_segs; size_t offset; int ret; @@ -1067,6 +1068,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, * 2) all bvecs are the same in size, except potentially the * first and last bvec */ + folio_mask = (1UL << imu->folio_shift) - 1; bvec = imu->bvec; if (offset >= bvec->bv_len) { unsigned long seg_skip; @@ -1075,10 +1077,9 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); bvec += seg_skip; - offset &= (1UL << imu->folio_shift) - 1; + offset &= folio_mask; } - - nr_segs = imu->nr_bvecs - (bvec - imu->bvec); + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; iov_iter_bvec(iter, ddir, bvec, nr_segs, len); iter->iov_offset = offset; return 0; -- cgit v1.2.3 From b419bed4f0a62c65a57dd495185821dd56bc435c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Apr 2025 16:48:26 -0600 Subject: io_uring/rsrc: ensure segments counts are correct on kbuf buffers kbuf imports have the front offset adjusted and segments removed, but the tail segments are still included in the segment count that gets passed in the iov_iter. As the segments aren't necessarily all the same size, move importing to a separate helper and iterate the mapped length to get an exact count. 
Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0c6d7e7415c8..f80a77c4973f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1032,6 +1032,26 @@ static int validate_fixed_range(u64 buf_addr, size_t len, return 0; } +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; + + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); + + if (count < imu->len) { + const struct bio_vec *bvec = iter->bvec; + + while (len > bvec->bv_len) { + len -= bvec->bv_len; + bvec++; + } + iter->nr_segs = 1 + bvec - iter->bvec; + } + return 0; +} + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) @@ -1052,11 +1072,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, offset = buf_addr - imu->ubuf; - if (imu->is_kbuf) { - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - iov_iter_advance(iter, offset); - return 0; - } + if (imu->is_kbuf) + return io_import_kbuf(ddir, iter, imu, len, offset); /* * Don't use iov_iter_advance() here, as it's really slow for -- cgit v1.2.3 From f12ecf5e1c5eca48b8652e893afcdb730384a6aa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 18 Apr 2025 13:02:27 +0100 Subject: io_uring/zcrx: fix late dma unmap for a dead dev There is a problem with page pools not dma-unmapping immediately when the device is going down, and delaying it until the page pool is destroyed, which is not allowed (see links). That just got fixed for normal page pools, and we need to address memory providers as well. Unmap pages in the memory provider uninstall callback, and protect it with a new lock. There is also a gap between when a dma mapping is created and the mp is installed, so if the device is killed in between, io_uring would be holding on to dma mappings to a dead device with no one to call ->uninstall. Move it to page pool init and rely on ->is_mapped to make sure it's only done once. 
Link: https://lore.kernel.org/lkml/8067f204-1380-4d37-8ffd-007fc6f26738@kernel.org/T/ Link: https://lore.kernel.org/all/20250409-page-pool-track-dma-v9-0-6a9ef2e0cba8@redhat.com/ Fixes: 34a3e60821ab9 ("io_uring/zcrx: implement zerocopy receive pp memory provider") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ef9b7db249b14f6e0b570a1bb77ff177389f881c.1744965853.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 21 +++++++++++++++++---- io_uring/zcrx.h | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5defbe8f95f9..fe86606b9f30 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -51,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); + area->is_mapped = false; } static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; @@ -280,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) ifq->ctx = ctx; spin_lock_init(&ifq->lock); spin_lock_init(&ifq->rq_lock); + mutex_init(&ifq->dma_lock); return ifq; } @@ -329,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) put_device(ifq->dev); io_free_rbuf_ring(ifq); + mutex_destroy(&ifq->dma_lock); kfree(ifq); } @@ -400,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; get_device(ifq->dev); - ret = io_zcrx_map_area(ifq, ifq->area); - if (ret) - goto err; - mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); @@ -624,6 +629,7 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + int ret; if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -636,6 +642,10 @@ static int io_pp_zc_init(struct page_pool *pp) if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + return ret; + percpu_ref_get(&ifq->ctx->refs); return 0; } @@ -671,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) struct io_zcrx_ifq *ifq = mp_priv; io_zcrx_drop_netdev(ifq); + if (ifq->area) + io_zcrx_unmap_area(ifq, ifq->area); + p->mp_ops = NULL; p->mp_priv = NULL; } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 47f1c0e8c197..f2bc811f022c 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -38,6 +38,7 @@ struct io_zcrx_ifq { struct net_device *netdev; netdevice_tracker netdev_tracker; spinlock_t lock; + struct mutex dma_lock; }; #if defined(CONFIG_IO_URING_ZCRX) -- cgit v1.2.3 From 5e16f1a68d28965c12b6fa227a306fef8a680f84 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:28:39 +0100 Subject: io_uring: don't duplicate flushing in io_req_post_cqe io_req_post_cqe() sets submit_state.cq_flush so that *flush_completions() can take care of batch commiting CQEs. Don't commit it twice by using __io_cq_unlock_post(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/41c416660c509cee676b6cad96081274bcb459f3.1745493861.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c6209fe44cb1..eedb47c8c79e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -872,10 +872,15 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } -- cgit v1.2.3 From edd43f4d6f50ec3de55a0c9e9df6348d1da51965 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Apr 2025 10:28:14 -0600 Subject: io_uring: fix 'sync' handling of io_fallback_tw() A previous commit added a 'sync' parameter to io_fallback_tw(), which if true, means the caller wants to wait on the fallback thread handling it. But the logic is somewhat messed up, ensure that ctxs are swapped and flushed appropriately. Cc: stable@vger.kernel.org Fixes: dfbe5561ae93 ("io_uring: flush offloaded and delayed task_work on exit") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eedb47c8c79e..a2b256e96d5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1083,21 +1083,22 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync) while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (sync && last_ctx != req->ctx) { + if (last_ctx != req->ctx) { if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } -- cgit v1.2.3 From f024d3a8ded0d8d2129ae123d7a5305c29ca44ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Apr 2025 07:17:17 -0600 Subject: io_uring/fdinfo: annotate racy sq/cq head/tail reads syzbot complains about the cached sq head read, and it's totally right. But we don't need to care, it's just reading fdinfo, and reading the CQ or SQ tail/head entries are known racy in that they are just a view into that very instant and may of course be outdated by the time they are reported. Annotate both the SQ head and CQ tail read with data_race() to avoid this syzbot complaint. 
Link: https://lore.kernel.org/io-uring/6811f6dc.050a0220.39e3a1.0d0e.GAE@google.com/ Reported-by: syzbot+3e77fd302e99f5af9394@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f60d0a9d505e..9414ca6d101c 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -123,11 +123,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { -- cgit v1.2.3
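For readers unfamiliar with the annotation used in the last patch above: below is a minimal, illustrative sketch of how data_race() is typically applied to a deliberately unsynchronized read in a seq_file show callback, in the same spirit as the io_uring_show_fdinfo() change. This is not part of the series; the helper name show_snapshot() and its counter argument are hypothetical, introduced only for the example.

#include <linux/compiler.h>
#include <linux/seq_file.h>

/*
 * Print a best-effort snapshot of a counter that other CPUs may update
 * concurrently. No lock is taken on purpose: the value is debugging
 * output only, so a stale value is acceptable. data_race() documents
 * that intent and keeps KCSAN from flagging the plain load.
 */
static void show_snapshot(struct seq_file *m, const unsigned int *counter)
{
	unsigned int snap = data_race(*counter);

	seq_printf(m, "Snapshot:\t%u\n", snap);
}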