From 8e62ba590160f91abba6490d9c17aa13bada4752 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 29 Apr 2025 16:59:53 -0700
Subject: cxl/test: Address missing MODULE_DESCRIPTION warnings for cxl_test

Add MODULE_DESCRIPTION() to address the following warnings:
WARNING: modpost: missing MODULE_DESCRIPTION() in test/cxl_test.o
WARNING: modpost: missing MODULE_DESCRIPTION() in test/cxl_mock.o
WARNING: modpost: missing MODULE_DESCRIPTION() in test/cxl_mock_mem.o

[dj: s/CXL test/cxl_test:/ per djbw's comment]

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/20250429235953.4175408-1-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 tools/testing/cxl/test/cxl.c  | 1 +
 tools/testing/cxl/test/mem.c  | 1 +
 tools/testing/cxl/test/mock.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 1c3336095923..8a5815ca870d 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -1527,5 +1527,6 @@ MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1");
 module_init(cxl_test_init);
 module_exit(cxl_test_exit);
 MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("cxl_test: setup module");
 MODULE_IMPORT_NS("ACPI");
 MODULE_IMPORT_NS("CXL");
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index bf9caa908f89..0f1d91f57ba3 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -1909,4 +1909,5 @@ static struct platform_driver cxl_mock_mem_driver = {
 
 module_platform_driver(cxl_mock_mem_driver);
 MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("cxl_test: mem device mock module");
 MODULE_IMPORT_NS("CXL");
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index af2594e4f35d..1989ae020df3 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -312,5 +312,6 @@ void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL");
 
 MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("cxl_test: emulation module");
 MODULE_IMPORT_NS("ACPI");
 MODULE_IMPORT_NS("CXL");
-- 
cgit v1.2.3


From 0c6e6f1357cbdc158d555346a728aa4aeb0d7011 Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Wed, 21 May 2025 13:47:41 +0100
Subject: cxl/edac: Add CXL memory device patrol scrub control feature

CXL spec 3.2 section 8.2.10.9.11.1 describes the device patrol scrub
control feature. The device patrol scrub proactively locates and makes
corrections to errors in regular cycle.

Allow specifying the number of hours within which the patrol scrub must be
completed, subject to minimum and maximum limits reported by the device.
Also allow disabling scrub allowing trade-off error rates against
performance.

Add support for patrol scrub control on CXL memory devices.
Register with the EDAC device driver, which retrieves the scrub attribute
descriptors from EDAC scrub and exposes the sysfs scrub control attributes
to userspace. For example, scrub control for the CXL memory device
"cxl_mem0" is exposed in /sys/bus/edac/devices/cxl_mem0/scrubX/.

Additionally, add support for region-based CXL memory patrol scrub control.
CXL memory regions may be interleaved across one or more CXL memory
devices. For example, region-based scrub control for "cxl_region1" is
exposed in /sys/bus/edac/devices/cxl_region1/scrubX/.

[dj: A few formatting fixes from Jonathan]

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20250521124749.817-4-shiju.jose@huawei.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 tools/testing/cxl/Kbuild | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 387f3df8b988..31a2d73c963f 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -67,6 +67,7 @@ cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
+cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o
 cxl_core-y += config_check.o
 cxl_core-y += cxl_core_test.o
 cxl_core-y += cxl_core_exports.o
-- 
cgit v1.2.3


From bf098d72696db1a947e72d549ec861dc1092deef Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:11 -0600
Subject: selftests: ublk: kublk: plumb q_id in io_uring user_data

Currently, when we process CQEs, we know which ublk_queue we are working
on because we know which ring we are working on, and ublk_queues and
rings are in 1:1 correspondence. However, as we decouple ublk_queues
from ublk server threads, ublk_queues and rings will no longer be in 1:1
correspondence - each ublk server thread will have a ring, and each
thread may issue commands against more than one ublk_queue. So in order
to know which ublk_queue a CQE refers to, plumb that information in the
associated SQE's user_data.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-2-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/fault_inject.c |  2 +-
 tools/testing/selftests/ublk/file_backed.c  | 10 +++++-----
 tools/testing/selftests/ublk/kublk.c        | 17 +++++++++--------
 tools/testing/selftests/ublk/kublk.h        | 17 +++++++++++++----
 tools/testing/selftests/ublk/null.c         | 12 ++++++------
 tools/testing/selftests/ublk/stripe.c       |  9 +++++----
 6 files changed, 39 insertions(+), 28 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 5421774d7867..5deff76327b2 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -48,7 +48,7 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
 
 	ublk_queue_alloc_sqes(q, &sqe, 1);
 	io_uring_prep_timeout(sqe, &ts, 1, 0);
-	sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, 1);
+	sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1);
 
 	ublk_queued_tgt_io(q, tag, 1);
 
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 509842df9bee..0e86123e309c 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -22,7 +22,7 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
 	io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
 	io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 	/* bit63 marks us as tgt io */
-	sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 	return 1;
 }
 
@@ -48,7 +48,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 			sqe[0]->buf_index = tag;
 		io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 		/* bit63 marks us as tgt io */
-		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 		return 1;
 	}
 
@@ -57,17 +57,17 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 	sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 
 	io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
 		iod->nr_sectors << 9,
 		iod->start_sector << 9);
 	sqe[1]->buf_index = tag;
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
-	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 
 	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
-	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	return 2;
 }
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index b5131a000795..c3bb52953936 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -627,7 +627,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	if (q->state & UBLKSRV_AUTO_BUF_REG)
 		ublk_set_auto_buf_reg(q, sqe[0], tag);
 
-	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
+	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
 	io_uring_sqe_set_data64(sqe[0], user_data);
 
 	io->flags = 0;
@@ -673,10 +673,11 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
 		q->tgt_ops->tgt_io_done(q, tag, cqe);
 }
 
-static void ublk_handle_cqe(struct io_uring *r,
+static void ublk_handle_cqe(struct ublk_dev *dev,
 		struct io_uring_cqe *cqe, void *data)
 {
-	struct ublk_queue *q = container_of(r, struct ublk_queue, ring);
+	unsigned q_id = user_data_to_q_id(cqe->user_data);
+	struct ublk_queue *q = &dev->q[q_id];
 	unsigned tag = user_data_to_tag(cqe->user_data);
 	unsigned cmd_op = user_data_to_op(cqe->user_data);
 	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
@@ -727,17 +728,17 @@ static void ublk_handle_cqe(struct io_uring *r,
 	}
 }
 
-static int ublk_reap_events_uring(struct io_uring *r)
+static int ublk_reap_events_uring(struct ublk_queue *q)
 {
 	struct io_uring_cqe *cqe;
 	unsigned head;
 	int count = 0;
 
-	io_uring_for_each_cqe(r, head, cqe) {
-		ublk_handle_cqe(r, cqe, NULL);
+	io_uring_for_each_cqe(&q->ring, head, cqe) {
+		ublk_handle_cqe(q->dev, cqe, NULL);
 		count += 1;
 	}
-	io_uring_cq_advance(r, count);
+	io_uring_cq_advance(&q->ring, count);
 
 	return count;
 }
@@ -756,7 +757,7 @@ static int ublk_process_io(struct ublk_queue *q)
 		return -ENODEV;
 
 	ret = io_uring_submit_and_wait(&q->ring, 1);
-	reapped = ublk_reap_events_uring(&q->ring);
+	reapped = ublk_reap_events_uring(q);
 
 	ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
 			ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index e34508bf5798..424e5d96775f 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -49,7 +49,8 @@
 #define UBLKSRV_IO_IDLE_SECS		20
 
 #define UBLK_IO_MAX_BYTES               (1 << 20)
-#define UBLK_MAX_QUEUES                 32
+#define UBLK_MAX_QUEUES_SHIFT		5
+#define UBLK_MAX_QUEUES                 (1 << UBLK_MAX_QUEUES_SHIFT)
 #define UBLK_QUEUE_DEPTH                1024
 
 #define UBLK_DBG_DEV            (1U << 0)
@@ -225,11 +226,14 @@ static inline int is_target_io(__u64 user_data)
 }
 
 static inline __u64 build_user_data(unsigned tag, unsigned op,
-		unsigned tgt_data, unsigned is_target_io)
+		unsigned tgt_data, unsigned q_id, unsigned is_target_io)
 {
-	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16));
+	/* we only have 7 bits to encode q_id */
+	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
 
-	return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63;
+	return tag | (op << 16) | (tgt_data << 24) |
+		(__u64)q_id << 56 | (__u64)is_target_io << 63;
 }
 
 static inline unsigned int user_data_to_tag(__u64 user_data)
@@ -247,6 +251,11 @@ static inline unsigned int user_data_to_tgt_data(__u64 user_data)
 	return (user_data >> 24) & 0xffff;
 }
 
+static inline unsigned int user_data_to_q_id(__u64 user_data)
+{
+	return (user_data >> 56) & 0x7f;
+}
+
 static inline unsigned short ublk_cmd_op_nr(unsigned int op)
 {
 	return _IOC_NR(op);
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 44aca31cf2b0..c415bf839e87 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -43,7 +43,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 }
 
 static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
-		struct io_uring_sqe *sqe)
+		struct io_uring_sqe *sqe, int q_id)
 {
 	unsigned ublk_op = ublksrv_get_op(iod);
 
@@ -52,7 +52,7 @@ static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
 	sqe->flags |= IOSQE_FIXED_FILE;
 	sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
 	sqe->len = iod->nr_sectors << 9; 	/* injected result */
-	sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe->user_data = build_user_data(tag, ublk_op, 0, q_id, 1);
 }
 
 static int null_queue_zc_io(struct ublk_queue *q, int tag)
@@ -64,14 +64,14 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
 
 	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
 	sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 
-	__setup_nop_io(tag, iod, sqe[1]);
+	__setup_nop_io(tag, iod, sqe[1], q->q_id);
 	sqe[1]->flags |= IOSQE_IO_HARDLINK;
 
 	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
-	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
 	return 2;
@@ -83,7 +83,7 @@ static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
 	struct io_uring_sqe *sqe[1];
 
 	ublk_queue_alloc_sqes(q, sqe, 1);
-	__setup_nop_io(tag, iod, sqe[0]);
+	__setup_nop_io(tag, iod, sqe[0], q->q_id);
 	return 1;
 }
 
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 404a143bf3d6..4fc45f42b02e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -144,7 +144,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
 		sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 		sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	}
 
 	for (i = zc; i < s->nr + extra - zc; i++) {
@@ -162,13 +162,14 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 				sqe[i]->flags |= IOSQE_IO_HARDLINK;
 		}
 		/* bit63 marks us as tgt io */
-		sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
+		sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, q->q_id, 1);
 	}
 	if (zc) {
 		struct io_uring_sqe *unreg = sqe[s->nr + 1];
 
 		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
-		unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1);
+		unreg->user_data = build_user_data(
+			tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
 	}
 
 	/* register buffer is skip_success */
@@ -185,7 +186,7 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod,
 	for (i = 0; i < conf->nr_files; i++) {
 		io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
 		io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
-		sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1);
+		sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, q->q_id, 1);
 	}
 	return conf->nr_files;
 }
-- 
cgit v1.2.3


From 97737097528397a8106ffc552e827a3dedccf132 Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:12 -0600
Subject: selftests: ublk: kublk: tie sqe allocation to io instead of queue

We currently have a helper ublk_queue_alloc_sqes which the ublk targets
use to allocate SQEs for their own operations. However, as we move
towards decoupled ublk_queues and ublk server threads, this helper does
not make sense anymore. SQEs are allocated from rings, and we will have
one ring per thread to avoid locking. Change the SQE allocation helper
to ublk_io_alloc_sqes. Currently this still allocates SQEs from the io's
queue's ring, but when we fully decouple threads and queues, it will
allocate from the io's thread's ring instead.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-3-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/fault_inject.c |  2 +-
 tools/testing/selftests/ublk/file_backed.c  |  6 +++---
 tools/testing/selftests/ublk/kublk.c        |  2 +-
 tools/testing/selftests/ublk/kublk.h        | 16 ++++++++++++----
 tools/testing/selftests/ublk/null.c         |  4 ++--
 tools/testing/selftests/ublk/stripe.c       |  4 ++--
 6 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 5deff76327b2..6e60f7d97125 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -46,7 +46,7 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
 		.tv_nsec = (long long)q->dev->private_data,
 	};
 
-	ublk_queue_alloc_sqes(q, &sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), &sqe, 1);
 	io_uring_prep_timeout(sqe, &ts, 1, 0);
 	sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1);
 
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 0e86123e309c..922a87108b9f 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -18,7 +18,7 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
 	unsigned ublk_op = ublksrv_get_op(iod);
 	struct io_uring_sqe *sqe[1];
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 	io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
 	io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 	/* bit63 marks us as tgt io */
@@ -36,7 +36,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 	void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr;
 
 	if (!zc || auto_zc) {
-		ublk_queue_alloc_sqes(q, sqe, 1);
+		ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 		if (!sqe[0])
 			return -ENOMEM;
 
@@ -52,7 +52,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 		return 1;
 	}
 
-	ublk_queue_alloc_sqes(q, sqe, 3);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
 	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index c3bb52953936..1602cf6f07a0 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -599,7 +599,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	if (io_uring_sq_space_left(&q->ring) < 1)
 		io_uring_submit(&q->ring);
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 	if (!sqe[0]) {
 		ublk_err("%s: run out of sqe %d, tag %d\n",
 				__func__, q->q_id, tag);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 424e5d96775f..64da26725fe1 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -124,6 +124,8 @@ struct ublk_io {
 	unsigned short flags;
 	unsigned short refs;		/* used by target code only */
 
+	int tag;
+
 	int result;
 
 	unsigned short tgt_ios;
@@ -289,17 +291,23 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
 	}
 }
 
-static inline int ublk_queue_alloc_sqes(struct ublk_queue *q,
+static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
+{
+	return container_of(io, struct ublk_queue, ios[io->tag]);
+}
+
+static inline int ublk_io_alloc_sqes(struct ublk_io *io,
 		struct io_uring_sqe *sqes[], int nr_sqes)
 {
-	unsigned left = io_uring_sq_space_left(&q->ring);
+	struct io_uring *ring = &ublk_io_to_queue(io)->ring;
+	unsigned left = io_uring_sq_space_left(ring);
 	int i;
 
 	if (left < nr_sqes)
-		io_uring_submit(&q->ring);
+		io_uring_submit(ring);
 
 	for (i = 0; i < nr_sqes; i++) {
-		sqes[i] = io_uring_get_sqe(&q->ring);
+		sqes[i] = io_uring_get_sqe(ring);
 		if (!sqes[i])
 			return i;
 	}
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index c415bf839e87..9acc7e0d271b 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -60,7 +60,7 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
 	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
 	struct io_uring_sqe *sqe[3];
 
-	ublk_queue_alloc_sqes(q, sqe, 3);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
 	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
 	sqe[0]->user_data = build_user_data(tag,
@@ -82,7 +82,7 @@ static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
 	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
 	struct io_uring_sqe *sqe[1];
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 	__setup_nop_io(tag, iod, sqe[0], q->q_id);
 	return 1;
 }
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 4fc45f42b02e..97079c3121ef 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -138,7 +138,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 	io->private_data = s;
 	calculate_stripe_array(conf, iod, s, base);
 
-	ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra);
 
 	if (zc) {
 		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
@@ -182,7 +182,7 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod,
 	struct io_uring_sqe *sqe[NR_STRIPE];
 	int i;
 
-	ublk_queue_alloc_sqes(q, sqe, conf->nr_files);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, conf->nr_files);
 	for (i = 0; i < conf->nr_files; i++) {
 		io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
 		io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
-- 
cgit v1.2.3


From 8f75ba28b8747d6a788daede0060d574b5693dac Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:13 -0600
Subject: selftests: ublk: kublk: lift queue initialization out of thread

Currently, each ublk server I/O handler thread initializes its own
queue. However, as we move towards decoupled ublk_queues and ublk server
threads, this model does not make sense anymore, as there will no longer
be a concept of a thread having "its own" queue. So lift queue
initialization out of the per-thread ublk_io_handler_fn and into a loop
in ublk_start_daemon (which runs once for each device).

There is a part of ublk_queue_init (ring initialization) which does
actually need to happen on the thread that will use the ring; that is
separated into a separate ublk_thread_init which is still called by each
I/O handler thread.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-4-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 68 +++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 21 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 1602cf6f07a0..2d6d163b7448 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -412,6 +412,17 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 	int i;
 	int nr_ios = q->q_depth;
 
+	if (q->io_cmd_buf)
+		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
+
+	for (i = 0; i < nr_ios; i++)
+		free(q->ios[i].buf_addr);
+}
+
+static void ublk_thread_deinit(struct ublk_queue *q)
+{
+	q->tid = 0;
+
 	io_uring_unregister_buffers(&q->ring);
 
 	io_uring_unregister_ring_fd(&q->ring);
@@ -421,28 +432,20 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 		close(q->ring.ring_fd);
 		q->ring.ring_fd = -1;
 	}
-
-	if (q->io_cmd_buf)
-		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
-
-	for (i = 0; i < nr_ios; i++)
-		free(q->ios[i].buf_addr);
 }
 
 static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 {
 	struct ublk_dev *dev = q->dev;
 	int depth = dev->dev_info.queue_depth;
-	int i, ret = -1;
+	int i;
 	int cmd_buf_size, io_buf_size;
 	unsigned long off;
-	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
 
 	q->tgt_ops = dev->tgt.ops;
 	q->state = 0;
 	q->q_depth = depth;
 	q->cmd_inflight = 0;
-	q->tid = gettid();
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
 		q->state |= UBLKSRV_NO_BUF;
@@ -479,6 +482,22 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 		}
 	}
 
+	return 0;
+ fail:
+	ublk_queue_deinit(q);
+	ublk_err("ublk dev %d queue %d failed\n",
+			dev->dev_info.dev_id, q->q_id);
+	return -ENOMEM;
+}
+
+static int ublk_thread_init(struct ublk_queue *q)
+{
+	struct ublk_dev *dev = q->dev;
+	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
+	int ret;
+
+	q->tid = gettid();
+
 	ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
 			IORING_SETUP_COOP_TASKRUN |
 			IORING_SETUP_SINGLE_ISSUER |
@@ -508,9 +527,9 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	}
 
 	return 0;
- fail:
-	ublk_queue_deinit(q);
-	ublk_err("ublk dev %d queue %d failed\n",
+fail:
+	ublk_thread_deinit(q);
+	ublk_err("ublk dev %d queue %d thread init failed\n",
 			dev->dev_info.dev_id, q->q_id);
 	return -ENOMEM;
 }
@@ -778,7 +797,6 @@ struct ublk_queue_info {
 	struct ublk_queue 	*q;
 	sem_t 			*queue_sem;
 	cpu_set_t 		*affinity;
-	unsigned char 		auto_zc_fallback;
 };
 
 static void *ublk_io_handler_fn(void *data)
@@ -786,15 +804,11 @@ static void *ublk_io_handler_fn(void *data)
 	struct ublk_queue_info *info = data;
 	struct ublk_queue *q = info->q;
 	int dev_id = q->dev->dev_info.dev_id;
-	unsigned extra_flags = 0;
 	int ret;
 
-	if (info->auto_zc_fallback)
-		extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
-
-	ret = ublk_queue_init(q, extra_flags);
+	ret = ublk_thread_init(q);
 	if (ret) {
-		ublk_err("ublk dev %d queue %d init queue failed\n",
+		ublk_err("ublk dev %d queue %d thread init failed\n",
 				dev_id, q->q_id);
 		return NULL;
 	}
@@ -813,7 +827,7 @@ static void *ublk_io_handler_fn(void *data)
 	} while (1);
 
 	ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
-	ublk_queue_deinit(q);
+	ublk_thread_deinit(q);
 	return NULL;
 }
 
@@ -857,6 +871,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 {
 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
 	struct ublk_queue_info *qinfo;
+	unsigned extra_flags = 0;
 	cpu_set_t *affinity_buf;
 	void *thread_ret;
 	sem_t queue_sem;
@@ -878,14 +893,23 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	if (ret)
 		return ret;
 
+	if (ctx->auto_zc_fallback)
+		extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+
 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
 		dev->q[i].dev = dev;
 		dev->q[i].q_id = i;
 
+		ret = ublk_queue_init(&dev->q[i], extra_flags);
+		if (ret) {
+			ublk_err("ublk dev %d queue %d init queue failed\n",
+				 dinfo->dev_id, i);
+			goto fail;
+		}
+
 		qinfo[i].q = &dev->q[i];
 		qinfo[i].queue_sem = &queue_sem;
 		qinfo[i].affinity = &affinity_buf[i];
-		qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
 		pthread_create(&dev->q[i].thread, NULL,
 				ublk_io_handler_fn,
 				&qinfo[i]);
@@ -918,6 +942,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
 		pthread_join(dev->q[i].thread, &thread_ret);
  fail:
+	for (i = 0; i < dinfo->nr_hw_queues; i++)
+		ublk_queue_deinit(&dev->q[i]);
 	ublk_dev_unprep(dev);
 	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
 
-- 
cgit v1.2.3


From b9848ca7a7643b13feed9ebb67f5150eb731610c Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:14 -0600
Subject: selftests: ublk: kublk: move per-thread data out of ublk_queue

Towards the goal of decoupling ublk_queues from ublk server threads,
move resources/data that should be per-thread rather than per-queue out
of ublk_queue and into a new struct ublk_thread.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-5-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 224 +++++++++++++++++++----------------
 tools/testing/selftests/ublk/kublk.h |  37 ++++--
 2 files changed, 144 insertions(+), 117 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 2d6d163b7448..40431a8357a8 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -348,8 +348,8 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
 
 		for (i = 0; i < info->nr_hw_queues; i++) {
 			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
-			printf("\tqueue %u: tid %d affinity(%s)\n",
-					i, dev->q[i].tid, buf);
+			printf("\tqueue %u: affinity(%s)\n",
+					i, buf);
 		}
 		free(affinity);
 	}
@@ -419,18 +419,16 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 		free(q->ios[i].buf_addr);
 }
 
-static void ublk_thread_deinit(struct ublk_queue *q)
+static void ublk_thread_deinit(struct ublk_thread *t)
 {
-	q->tid = 0;
+	io_uring_unregister_buffers(&t->ring);
 
-	io_uring_unregister_buffers(&q->ring);
+	io_uring_unregister_ring_fd(&t->ring);
 
-	io_uring_unregister_ring_fd(&q->ring);
-
-	if (q->ring.ring_fd > 0) {
-		io_uring_unregister_files(&q->ring);
-		close(q->ring.ring_fd);
-		q->ring.ring_fd = -1;
+	if (t->ring.ring_fd > 0) {
+		io_uring_unregister_files(&t->ring);
+		close(t->ring.ring_fd);
+		t->ring.ring_fd = -1;
 	}
 }
 
@@ -445,7 +443,6 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	q->tgt_ops = dev->tgt.ops;
 	q->state = 0;
 	q->q_depth = depth;
-	q->cmd_inflight = 0;
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
 		q->state |= UBLKSRV_NO_BUF;
@@ -470,6 +467,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	for (i = 0; i < q->q_depth; i++) {
 		q->ios[i].buf_addr = NULL;
 		q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
+		q->ios[i].tag = i;
 
 		if (q->state & UBLKSRV_NO_BUF)
 			continue;
@@ -490,47 +488,46 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	return -ENOMEM;
 }
 
-static int ublk_thread_init(struct ublk_queue *q)
+static int ublk_thread_init(struct ublk_thread *t)
 {
-	struct ublk_dev *dev = q->dev;
+	struct ublk_dev *dev = t->dev;
 	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
 	int ret;
 
-	q->tid = gettid();
-
-	ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
+	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
 			IORING_SETUP_COOP_TASKRUN |
 			IORING_SETUP_SINGLE_ISSUER |
 			IORING_SETUP_DEFER_TASKRUN);
 	if (ret < 0) {
-		ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
-				q->dev->dev_info.dev_id, q->q_id, ret);
+		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
+				dev->dev_info.dev_id, t->idx, ret);
 		goto fail;
 	}
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
-		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+		ret = io_uring_register_buffers_sparse(
+			&t->ring, dev->dev_info.queue_depth);
 		if (ret) {
-			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
-					dev->dev_info.dev_id, q->q_id, ret);
+			ublk_err("ublk dev %d thread %d register spare buffers failed %d",
+					dev->dev_info.dev_id, t->idx, ret);
 			goto fail;
 		}
 	}
 
-	io_uring_register_ring_fd(&q->ring);
+	io_uring_register_ring_fd(&t->ring);
 
-	ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds);
+	ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
 	if (ret) {
-		ublk_err("ublk dev %d queue %d register files failed %d\n",
-				q->dev->dev_info.dev_id, q->q_id, ret);
+		ublk_err("ublk dev %d thread %d register files failed %d\n",
+				t->dev->dev_info.dev_id, t->idx, ret);
 		goto fail;
 	}
 
 	return 0;
 fail:
-	ublk_thread_deinit(q);
-	ublk_err("ublk dev %d queue %d thread init failed\n",
-			dev->dev_info.dev_id, q->q_id);
+	ublk_thread_deinit(t);
+	ublk_err("ublk dev %d thread %d init failed\n",
+			dev->dev_info.dev_id, t->idx);
 	return -ENOMEM;
 }
 
@@ -589,8 +586,10 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
 	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
 }
 
-int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
+int ublk_queue_io_cmd(struct ublk_io *io)
 {
+	struct ublk_thread *t = io->t;
+	struct ublk_queue *q = ublk_io_to_queue(io);
 	struct ublksrv_io_cmd *cmd;
 	struct io_uring_sqe *sqe[1];
 	unsigned int cmd_op = 0;
@@ -615,13 +614,13 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
 		cmd_op = UBLK_U_IO_FETCH_REQ;
 
-	if (io_uring_sq_space_left(&q->ring) < 1)
-		io_uring_submit(&q->ring);
+	if (io_uring_sq_space_left(&t->ring) < 1)
+		io_uring_submit(&t->ring);
 
-	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
+	ublk_io_alloc_sqes(io, sqe, 1);
 	if (!sqe[0]) {
-		ublk_err("%s: run out of sqe %d, tag %d\n",
-				__func__, q->q_id, tag);
+		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
+				__func__, t->idx, io->tag);
 		return -1;
 	}
 
@@ -636,7 +635,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	sqe[0]->opcode	= IORING_OP_URING_CMD;
 	sqe[0]->flags	= IOSQE_FIXED_FILE;
 	sqe[0]->rw_flags	= 0;
-	cmd->tag	= tag;
+	cmd->tag	= io->tag;
 	cmd->q_id	= q->q_id;
 	if (!(q->state & UBLKSRV_NO_BUF))
 		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
@@ -644,37 +643,46 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 		cmd->addr	= 0;
 
 	if (q->state & UBLKSRV_AUTO_BUF_REG)
-		ublk_set_auto_buf_reg(q, sqe[0], tag);
+		ublk_set_auto_buf_reg(q, sqe[0], io->tag);
 
-	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
+	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
 	io_uring_sqe_set_data64(sqe[0], user_data);
 
 	io->flags = 0;
 
-	q->cmd_inflight += 1;
+	t->cmd_inflight += 1;
 
-	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
-			__func__, q->q_id, tag, cmd_op,
-			io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
+	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
+			__func__, t->idx, q->q_id, io->tag, cmd_op,
+			io->flags, !!(t->state & UBLKSRV_THREAD_STOPPING));
 	return 1;
 }
 
-static void ublk_submit_fetch_commands(struct ublk_queue *q)
+static void ublk_submit_fetch_commands(struct ublk_thread *t)
 {
+	/*
+	 * Service exclusively the queue whose q_id matches our thread
+	 * index. This may change in the future.
+	 */
+	struct ublk_queue *q = &t->dev->q[t->idx];
+	struct ublk_io *io;
 	int i = 0;
 
-	for (i = 0; i < q->q_depth; i++)
-		ublk_queue_io_cmd(q, &q->ios[i], i);
+	for (i = 0; i < q->q_depth; i++) {
+		io = &q->ios[i];
+		io->t = t;
+		ublk_queue_io_cmd(io);
+	}
 }
 
-static int ublk_queue_is_idle(struct ublk_queue *q)
+static int ublk_thread_is_idle(struct ublk_thread *t)
 {
-	return !io_uring_sq_ready(&q->ring) && !q->io_inflight;
+	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
 }
 
-static int ublk_queue_is_done(struct ublk_queue *q)
+static int ublk_thread_is_done(struct ublk_thread *t)
 {
-	return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q);
+	return (t->state & UBLKSRV_THREAD_STOPPING) && ublk_thread_is_idle(t);
 }
 
 static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
@@ -692,15 +700,16 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
 		q->tgt_ops->tgt_io_done(q, tag, cqe);
 }
 
-static void ublk_handle_cqe(struct ublk_dev *dev,
+static void ublk_handle_cqe(struct ublk_thread *t,
 		struct io_uring_cqe *cqe, void *data)
 {
+	struct ublk_dev *dev = t->dev;
 	unsigned q_id = user_data_to_q_id(cqe->user_data);
 	struct ublk_queue *q = &dev->q[q_id];
 	unsigned tag = user_data_to_tag(cqe->user_data);
 	unsigned cmd_op = user_data_to_op(cqe->user_data);
 	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
-		!(q->state & UBLKSRV_QUEUE_STOPPING);
+		!(t->state & UBLKSRV_THREAD_STOPPING);
 	struct ublk_io *io;
 
 	if (cqe->res < 0 && cqe->res != -ENODEV)
@@ -711,7 +720,7 @@ static void ublk_handle_cqe(struct ublk_dev *dev,
 			__func__, cqe->res, q->q_id, tag, cmd_op,
 			is_target_io(cqe->user_data),
 			user_data_to_tgt_data(cqe->user_data),
-			(q->state & UBLKSRV_QUEUE_STOPPING));
+			(t->state & UBLKSRV_THREAD_STOPPING));
 
 	/* Don't retrieve io in case of target io */
 	if (is_target_io(cqe->user_data)) {
@@ -720,10 +729,10 @@ static void ublk_handle_cqe(struct ublk_dev *dev,
 	}
 
 	io = &q->ios[tag];
-	q->cmd_inflight--;
+	t->cmd_inflight--;
 
 	if (!fetch) {
-		q->state |= UBLKSRV_QUEUE_STOPPING;
+		t->state |= UBLKSRV_THREAD_STOPPING;
 		io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
 	}
 
@@ -733,7 +742,7 @@ static void ublk_handle_cqe(struct ublk_dev *dev,
 			q->tgt_ops->queue_io(q, tag);
 	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
 		io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
-		ublk_queue_io_cmd(q, io, tag);
+		ublk_queue_io_cmd(io);
 	} else {
 		/*
 		 * COMMIT_REQ will be completed immediately since no fetching
@@ -747,87 +756,92 @@ static void ublk_handle_cqe(struct ublk_dev *dev,
 	}
 }
 
-static int ublk_reap_events_uring(struct ublk_queue *q)
+static int ublk_reap_events_uring(struct ublk_thread *t)
 {
 	struct io_uring_cqe *cqe;
 	unsigned head;
 	int count = 0;
 
-	io_uring_for_each_cqe(&q->ring, head, cqe) {
-		ublk_handle_cqe(q->dev, cqe, NULL);
+	io_uring_for_each_cqe(&t->ring, head, cqe) {
+		ublk_handle_cqe(t, cqe, NULL);
 		count += 1;
 	}
-	io_uring_cq_advance(&q->ring, count);
+	io_uring_cq_advance(&t->ring, count);
 
 	return count;
 }
 
-static int ublk_process_io(struct ublk_queue *q)
+static int ublk_process_io(struct ublk_thread *t)
 {
 	int ret, reapped;
 
-	ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n",
-				q->dev->dev_info.dev_id,
-				q->q_id, io_uring_sq_ready(&q->ring),
-				q->cmd_inflight,
-				(q->state & UBLKSRV_QUEUE_STOPPING));
+	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
+				t->dev->dev_info.dev_id,
+				t->idx, io_uring_sq_ready(&t->ring),
+				t->cmd_inflight,
+				(t->state & UBLKSRV_THREAD_STOPPING));
 
-	if (ublk_queue_is_done(q))
+	if (ublk_thread_is_done(t))
 		return -ENODEV;
 
-	ret = io_uring_submit_and_wait(&q->ring, 1);
-	reapped = ublk_reap_events_uring(q);
+	ret = io_uring_submit_and_wait(&t->ring, 1);
+	reapped = ublk_reap_events_uring(t);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
-			ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
-			(q->state & UBLKSRV_QUEUE_IDLE));
+	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
+			ret, reapped, (t->state & UBLKSRV_THREAD_STOPPING),
+			(t->state & UBLKSRV_THREAD_IDLE));
 
 	return reapped;
 }
 
-static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
+static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
 		cpu_set_t *cpuset)
 {
         if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
-                ublk_err("ublk dev %u queue %u set affinity failed",
-                                q->dev->dev_info.dev_id, q->q_id);
+		ublk_err("ublk dev %u thread %u set affinity failed",
+				t->dev->dev_info.dev_id, t->idx);
 }
 
-struct ublk_queue_info {
-	struct ublk_queue 	*q;
-	sem_t 			*queue_sem;
+struct ublk_thread_info {
+	struct ublk_dev 	*dev;
+	unsigned		idx;
+	sem_t 			*ready;
 	cpu_set_t 		*affinity;
 };
 
 static void *ublk_io_handler_fn(void *data)
 {
-	struct ublk_queue_info *info = data;
-	struct ublk_queue *q = info->q;
-	int dev_id = q->dev->dev_info.dev_id;
+	struct ublk_thread_info *info = data;
+	struct ublk_thread *t = &info->dev->threads[info->idx];
+	int dev_id = info->dev->dev_info.dev_id;
 	int ret;
 
-	ret = ublk_thread_init(q);
+	t->dev = info->dev;
+	t->idx = info->idx;
+
+	ret = ublk_thread_init(t);
 	if (ret) {
-		ublk_err("ublk dev %d queue %d thread init failed\n",
-				dev_id, q->q_id);
+		ublk_err("ublk dev %d thread %u init failed\n",
+				dev_id, t->idx);
 		return NULL;
 	}
 	/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
-	ublk_queue_set_sched_affinity(q, info->affinity);
-	sem_post(info->queue_sem);
+	ublk_thread_set_sched_affinity(t, info->affinity);
+	sem_post(info->ready);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
-			q->tid, dev_id, q->q_id);
+	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
+			gettid(), dev_id, t->idx);
 
 	/* submit all io commands to ublk driver */
-	ublk_submit_fetch_commands(q);
+	ublk_submit_fetch_commands(t);
 	do {
-		if (ublk_process_io(q) < 0)
+		if (ublk_process_io(t) < 0)
 			break;
 	} while (1);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
-	ublk_thread_deinit(q);
+	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
+		 gettid(), dev_id, t->idx);
+	ublk_thread_deinit(t);
 	return NULL;
 }
 
@@ -870,21 +884,20 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 {
 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
-	struct ublk_queue_info *qinfo;
+	struct ublk_thread_info *tinfo;
 	unsigned extra_flags = 0;
 	cpu_set_t *affinity_buf;
 	void *thread_ret;
-	sem_t queue_sem;
+	sem_t ready;
 	int ret, i;
 
 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
 
-	qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
-			dinfo->nr_hw_queues);
-	if (!qinfo)
+	tinfo = calloc(sizeof(struct ublk_thread_info), dinfo->nr_hw_queues);
+	if (!tinfo)
 		return -ENOMEM;
 
-	sem_init(&queue_sem, 0, 0);
+	sem_init(&ready, 0, 0);
 	ret = ublk_dev_prep(ctx, dev);
 	if (ret)
 		return ret;
@@ -907,17 +920,18 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 			goto fail;
 		}
 
-		qinfo[i].q = &dev->q[i];
-		qinfo[i].queue_sem = &queue_sem;
-		qinfo[i].affinity = &affinity_buf[i];
-		pthread_create(&dev->q[i].thread, NULL,
+		tinfo[i].dev = dev;
+		tinfo[i].idx = i;
+		tinfo[i].ready = &ready;
+		tinfo[i].affinity = &affinity_buf[i];
+		pthread_create(&dev->threads[i].thread, NULL,
 				ublk_io_handler_fn,
-				&qinfo[i]);
+				&tinfo[i]);
 	}
 
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
-		sem_wait(&queue_sem);
-	free(qinfo);
+		sem_wait(&ready);
+	free(tinfo);
 	free(affinity_buf);
 
 	/* everything is fine now, start us */
@@ -940,7 +954,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 
 	/* wait until we are terminated */
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
-		pthread_join(dev->q[i].thread, &thread_ret);
+		pthread_join(dev->threads[i].thread, &thread_ret);
  fail:
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
 		ublk_queue_deinit(&dev->q[i]);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 64da26725fe1..3a2ae095bee1 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -51,10 +51,12 @@
 #define UBLK_IO_MAX_BYTES               (1 << 20)
 #define UBLK_MAX_QUEUES_SHIFT		5
 #define UBLK_MAX_QUEUES                 (1 << UBLK_MAX_QUEUES_SHIFT)
+#define UBLK_MAX_THREADS_SHIFT		5
+#define UBLK_MAX_THREADS		(1 << UBLK_MAX_THREADS_SHIFT)
 #define UBLK_QUEUE_DEPTH                1024
 
 #define UBLK_DBG_DEV            (1U << 0)
-#define UBLK_DBG_QUEUE          (1U << 1)
+#define UBLK_DBG_THREAD         (1U << 1)
 #define UBLK_DBG_IO_CMD         (1U << 2)
 #define UBLK_DBG_IO             (1U << 3)
 #define UBLK_DBG_CTRL_CMD       (1U << 4)
@@ -62,6 +64,7 @@
 
 struct ublk_dev;
 struct ublk_queue;
+struct ublk_thread;
 
 struct stripe_ctx {
 	/* stripe */
@@ -130,6 +133,7 @@ struct ublk_io {
 
 	unsigned short tgt_ios;
 	void *private_data;
+	struct ublk_thread *t;
 };
 
 struct ublk_tgt_ops {
@@ -168,28 +172,37 @@ struct ublk_tgt {
 struct ublk_queue {
 	int q_id;
 	int q_depth;
-	unsigned int cmd_inflight;
-	unsigned int io_inflight;
 	struct ublk_dev *dev;
 	const struct ublk_tgt_ops *tgt_ops;
 	struct ublksrv_io_desc *io_cmd_buf;
-	struct io_uring ring;
+
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
-#define UBLKSRV_QUEUE_STOPPING	(1U << 0)
-#define UBLKSRV_QUEUE_IDLE	(1U << 1)
 #define UBLKSRV_NO_BUF		(1U << 2)
 #define UBLKSRV_ZC		(1U << 3)
 #define UBLKSRV_AUTO_BUF_REG		(1U << 4)
 #define UBLKSRV_AUTO_BUF_REG_FALLBACK	(1U << 5)
 	unsigned state;
-	pid_t tid;
+};
+
+struct ublk_thread {
+	struct ublk_dev *dev;
+	struct io_uring ring;
+	unsigned int cmd_inflight;
+	unsigned int io_inflight;
+
 	pthread_t thread;
+	unsigned idx;
+
+#define UBLKSRV_THREAD_STOPPING	(1U << 0)
+#define UBLKSRV_THREAD_IDLE	(1U << 1)
+	unsigned state;
 };
 
 struct ublk_dev {
 	struct ublk_tgt tgt;
 	struct ublksrv_ctrl_dev_info  dev_info;
 	struct ublk_queue q[UBLK_MAX_QUEUES];
+	struct ublk_thread threads[UBLK_MAX_THREADS];
 
 	int fds[MAX_BACK_FILES + 1];	/* fds[0] points to /dev/ublkcN */
 	int nr_fds;
@@ -214,7 +227,7 @@ struct ublk_dev {
 
 
 extern unsigned int ublk_dbg_mask;
-extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag);
+extern int ublk_queue_io_cmd(struct ublk_io *io);
 
 
 static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
@@ -299,7 +312,7 @@ static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
 static inline int ublk_io_alloc_sqes(struct ublk_io *io,
 		struct io_uring_sqe *sqes[], int nr_sqes)
 {
-	struct io_uring *ring = &ublk_io_to_queue(io)->ring;
+	struct io_uring *ring = &io->t->ring;
 	unsigned left = io_uring_sq_space_left(ring);
 	int i;
 
@@ -390,7 +403,7 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res)
 
 	ublk_mark_io_done(io, res);
 
-	return ublk_queue_io_cmd(q, io, tag);
+	return ublk_queue_io_cmd(io);
 }
 
 static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued)
@@ -400,7 +413,7 @@ static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int qu
 	else {
 		struct ublk_io *io = ublk_get_io(q, tag);
 
-		q->io_inflight += queued;
+		io->t->io_inflight += queued;
 		io->tgt_ios = queued;
 		io->result = 0;
 	}
@@ -410,7 +423,7 @@ static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag)
 {
 	struct ublk_io *io = ublk_get_io(q, tag);
 
-	q->io_inflight--;
+	io->t->io_inflight--;
 
 	return --io->tgt_ios == 0;
 }
-- 
cgit v1.2.3


From abe54c16034631db01aba02e06da569b33002ab1 Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:15 -0600
Subject: selftests: ublk: kublk: decouple ublk_queues from ublk server threads

Add support in kublk for decoupled ublk_queues and ublk server threads.
kublk now has two modes of operation:

- (preexisting mode) threads and queues are paired 1:1, and each thread
  services all the I/Os of one queue
- (new mode) thread and queue counts are independently configurable.
  threads service I/Os in a way that balances load across threads even
  if load is not balanced over queues.

The default is the preexisting mode. The new mode is activated by
passing the --per_io_tasks flag.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-6-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/file_backed.c |   4 +-
 tools/testing/selftests/ublk/kublk.c       | 105 ++++++++++++++++++++++++-----
 tools/testing/selftests/ublk/kublk.h       |   5 ++
 tools/testing/selftests/ublk/null.c        |   6 +-
 tools/testing/selftests/ublk/stripe.c      |   4 +-
 5 files changed, 100 insertions(+), 24 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 922a87108b9f..cfa59b631693 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -54,7 +54,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 
 	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -66,7 +66,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
 	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	return 2;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 40431a8357a8..a98e14e4c245 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -505,8 +505,11 @@ static int ublk_thread_init(struct ublk_thread *t)
 	}
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
+		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
+		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
+		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
 		ret = io_uring_register_buffers_sparse(
-			&t->ring, dev->dev_info.queue_depth);
+			&t->ring, max_nr_ios_per_thread);
 		if (ret) {
 			ublk_err("ublk dev %d thread %d register spare buffers failed %d",
 					dev->dev_info.dev_id, t->idx, ret);
@@ -578,7 +581,7 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
 	if (q->tgt_ops->buf_index)
 		buf.index = q->tgt_ops->buf_index(q, tag);
 	else
-		buf.index = tag;
+		buf.index = q->ios[tag].buf_index;
 
 	if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
@@ -660,18 +663,44 @@ int ublk_queue_io_cmd(struct ublk_io *io)
 
 static void ublk_submit_fetch_commands(struct ublk_thread *t)
 {
-	/*
-	 * Service exclusively the queue whose q_id matches our thread
-	 * index. This may change in the future.
-	 */
-	struct ublk_queue *q = &t->dev->q[t->idx];
+	struct ublk_queue *q;
 	struct ublk_io *io;
-	int i = 0;
+	int i = 0, j = 0;
 
-	for (i = 0; i < q->q_depth; i++) {
-		io = &q->ios[i];
-		io->t = t;
-		ublk_queue_io_cmd(io);
+	if (t->dev->per_io_tasks) {
+		/*
+		 * Lexicographically order all the (qid,tag) pairs, with
+		 * qid taking priority (so (1,0) > (0,1)). Then make
+		 * this thread the daemon for every Nth entry in this
+		 * list (N is the number of threads), starting at this
+		 * thread's index. This ensures that each queue is
+		 * handled by as many ublk server threads as possible,
+		 * so that load that is concentrated on one or a few
+		 * queues can make use of all ublk server threads.
+		 */
+		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
+		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
+		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
+			int q_id = i / dinfo->queue_depth;
+			int tag = i % dinfo->queue_depth;
+			q = &t->dev->q[q_id];
+			io = &q->ios[tag];
+			io->t = t;
+			io->buf_index = j++;
+			ublk_queue_io_cmd(io);
+		}
+	} else {
+		/*
+		 * Service exclusively the queue whose q_id matches our
+		 * thread index.
+		 */
+		struct ublk_queue *q = &t->dev->q[t->idx];
+		for (i = 0; i < q->q_depth; i++) {
+			io = &q->ios[i];
+			io->t = t;
+			io->buf_index = i;
+			ublk_queue_io_cmd(io);
+		}
 	}
 }
 
@@ -826,7 +855,8 @@ static void *ublk_io_handler_fn(void *data)
 		return NULL;
 	}
 	/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
-	ublk_thread_set_sched_affinity(t, info->affinity);
+	if (info->affinity)
+		ublk_thread_set_sched_affinity(t, info->affinity);
 	sem_post(info->ready);
 
 	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
@@ -893,7 +923,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 
 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
 
-	tinfo = calloc(sizeof(struct ublk_thread_info), dinfo->nr_hw_queues);
+	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
 	if (!tinfo)
 		return -ENOMEM;
 
@@ -919,17 +949,29 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 				 dinfo->dev_id, i);
 			goto fail;
 		}
+	}
 
+	for (i = 0; i < dev->nthreads; i++) {
 		tinfo[i].dev = dev;
 		tinfo[i].idx = i;
 		tinfo[i].ready = &ready;
-		tinfo[i].affinity = &affinity_buf[i];
+
+		/*
+		 * If threads are not tied 1:1 to queues, setting thread
+		 * affinity based on queue affinity makes little sense.
+		 * However, thread CPU affinity has significant impact
+		 * on performance, so to compare fairly, we'll still set
+		 * thread CPU affinity based on queue affinity where
+		 * possible.
+		 */
+		if (dev->nthreads == dinfo->nr_hw_queues)
+			tinfo[i].affinity = &affinity_buf[i];
 		pthread_create(&dev->threads[i].thread, NULL,
 				ublk_io_handler_fn,
 				&tinfo[i]);
 	}
 
-	for (i = 0; i < dinfo->nr_hw_queues; i++)
+	for (i = 0; i < dev->nthreads; i++)
 		sem_wait(&ready);
 	free(tinfo);
 	free(affinity_buf);
@@ -953,7 +995,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
 
 	/* wait until we are terminated */
-	for (i = 0; i < dinfo->nr_hw_queues; i++)
+	for (i = 0; i < dev->nthreads; i++)
 		pthread_join(dev->threads[i].thread, &thread_ret);
  fail:
 	for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -1063,6 +1105,7 @@ wait:
 
 static int __cmd_dev_add(const struct dev_ctx *ctx)
 {
+	unsigned nthreads = ctx->nthreads;
 	unsigned nr_queues = ctx->nr_hw_queues;
 	const char *tgt_type = ctx->tgt_type;
 	unsigned depth = ctx->queue_depth;
@@ -1086,6 +1129,23 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 		return -EINVAL;
 	}
 
+	/* default to 1:1 threads:queues if nthreads is unspecified */
+	if (!nthreads)
+		nthreads = nr_queues;
+
+	if (nthreads > UBLK_MAX_THREADS) {
+		ublk_err("%s: %u is too many threads (max %u)\n",
+				__func__, nthreads, UBLK_MAX_THREADS);
+		return -EINVAL;
+	}
+
+	if (nthreads != nr_queues && !ctx->per_io_tasks) {
+		ublk_err("%s: threads %u must be same as queues %u if "
+			"not using per_io_tasks\n",
+			__func__, nthreads, nr_queues);
+		return -EINVAL;
+	}
+
 	dev = ublk_ctrl_init();
 	if (!dev) {
 		ublk_err("%s: can't alloc dev id %d, type %s\n",
@@ -1109,6 +1169,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	if ((features & UBLK_F_QUIESCE) &&
 			(info->flags & UBLK_F_USER_RECOVERY))
 		info->flags |= UBLK_F_QUIESCE;
+	dev->nthreads = nthreads;
+	dev->per_io_tasks = ctx->per_io_tasks;
 	dev->tgt.ops = ops;
 	dev->tgt.sq_depth = depth;
 	dev->tgt.cq_depth = depth;
@@ -1307,6 +1369,7 @@ static int cmd_dev_get_features(void)
 		[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
 		[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
 		[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
+		[const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1401,8 +1464,10 @@ static void __cmd_create_help(char *exe, bool recovery)
 			exe, recovery ? "recover" : "add");
 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
 	printf("\t[-e 0|1 ] [-i 0|1]\n");
+	printf("\t[--nthreads threads] [--per_io_tasks]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
+	printf("\tdefault: nthreads=nr_queues");
 
 	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
@@ -1459,6 +1524,8 @@ int main(int argc, char *argv[])
 		{ "auto_zc",		0,	NULL,  0 },
 		{ "auto_zc_fallback", 	0,	NULL,  0 },
 		{ "size",		1,	NULL, 's'},
+		{ "nthreads",		1,	NULL,  0 },
+		{ "per_io_tasks",	0,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1534,6 +1601,10 @@ int main(int argc, char *argv[])
 				ctx.flags |= UBLK_F_AUTO_BUF_REG;
 			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
 				ctx.auto_zc_fallback = 1;
+			if (!strcmp(longopts[option_idx].name, "nthreads"))
+				ctx.nthreads = strtol(optarg, NULL, 10);
+			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
+				ctx.per_io_tasks = 1;
 			break;
 		case '?':
 			/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 3a2ae095bee1..6be601536b3d 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -80,6 +80,7 @@ struct dev_ctx {
 	char tgt_type[16];
 	unsigned long flags;
 	unsigned nr_hw_queues;
+	unsigned short nthreads;
 	unsigned queue_depth;
 	int dev_id;
 	int nr_files;
@@ -89,6 +90,7 @@ struct dev_ctx {
 	unsigned int	fg:1;
 	unsigned int	recovery:1;
 	unsigned int	auto_zc_fallback:1;
+	unsigned int	per_io_tasks:1;
 
 	int _evtfd;
 	int _shmid;
@@ -131,6 +133,7 @@ struct ublk_io {
 
 	int result;
 
+	unsigned short buf_index;
 	unsigned short tgt_ios;
 	void *private_data;
 	struct ublk_thread *t;
@@ -203,6 +206,8 @@ struct ublk_dev {
 	struct ublksrv_ctrl_dev_info  dev_info;
 	struct ublk_queue q[UBLK_MAX_QUEUES];
 	struct ublk_thread threads[UBLK_MAX_THREADS];
+	unsigned nthreads;
+	unsigned per_io_tasks;
 
 	int fds[MAX_BACK_FILES + 1];	/* fds[0] points to /dev/ublkcN */
 	int nr_fds;
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 9acc7e0d271b..afe0b99d77ee 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -62,7 +62,7 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
 
 	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
@@ -70,7 +70,7 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
 	__setup_nop_io(tag, iod, sqe[1], q->q_id);
 	sqe[1]->flags |= IOSQE_IO_HARDLINK;
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
@@ -136,7 +136,7 @@ static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
 {
 	if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
 		return (unsigned short)-1;
-	return tag;
+	return q->ios[tag].buf_index;
 }
 
 const struct ublk_tgt_ops null_tgt_ops = {
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 97079c3121ef..37d50bbf5f5e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -141,7 +141,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra);
 
 	if (zc) {
-		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
 		sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 		sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -167,7 +167,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 	if (zc) {
 		struct io_uring_sqe *unreg = sqe[s->nr + 1];
 
-		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
+		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
 		unreg->user_data = build_user_data(
 			tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
 	}
-- 
cgit v1.2.3


From 236918d3e9ac45d593c2f74e1df598483a508d2f Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:16 -0600
Subject: selftests: ublk: add functional test for per io daemons

Add a new test test_generic_12 which:

- sets up a ublk server with per_io_tasks and a different number of ublk
  server threads and ublk_queues. This is possible now that these
  objects are decoupled
- runs some I/O load from a single CPU
- verifies that all the ublk server threads handle some I/O

Before this changeset, this test fails, since I/O issued from one CPU is
always handled by the one ublk server thread. After this changeset, the
test passes.

In the future, the last check above may be strengthened to "verify that
all ublk server threads handle the same amount of I/O." However, this
requires some adjustments/bugfixes to tag allocation, so this work is
postponed to a followup.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-7-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile              |  1 +
 tools/testing/selftests/ublk/test_generic_12.sh    | 55 ++++++++++++++++++++++
 .../selftests/ublk/trace/count_ios_per_tid.bt      | 11 +++++
 3 files changed, 67 insertions(+)
 create mode 100755 tools/testing/selftests/ublk/test_generic_12.sh
 create mode 100644 tools/testing/selftests/ublk/trace/count_ios_per_tid.bt

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 4dde8838261d..5d7f4ecfb816 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -19,6 +19,7 @@ TEST_PROGS += test_generic_08.sh
 TEST_PROGS += test_generic_09.sh
 TEST_PROGS += test_generic_10.sh
 TEST_PROGS += test_generic_11.sh
+TEST_PROGS += test_generic_12.sh
 
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
new file mode 100755
index 000000000000..7abbb00d251d
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_12"
+ERR_CODE=0
+
+if ! _have_program bpftrace; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "do imbalanced load, it should be balanced over I/O threads"
+
+NTHREADS=6
+dev_id=$(_add_ublk_dev -t null -q 4 -d 16 --nthreads $NTHREADS --per_io_tasks)
+_check_add_dev $TID $?
+
+dev_t=$(_get_disk_dev_t "$dev_id")
+bpftrace trace/count_ios_per_tid.bt "$dev_t" > "$UBLK_TMP" 2>&1 &
+btrace_pid=$!
+sleep 2
+
+if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
+	_cleanup_test "null"
+	exit "$UBLK_SKIP_CODE"
+fi
+
+# do imbalanced I/O on the ublk device
+# pin to cpu 0 to prevent migration/only target one queue
+fio --name=write_seq \
+    --filename=/dev/ublkb"${dev_id}" \
+    --ioengine=libaio --iodepth=16 \
+    --rw=write \
+    --size=512M \
+    --direct=1 \
+    --bs=4k \
+    --cpus_allowed=0 > /dev/null 2>&1
+ERR_CODE=$?
+kill "$btrace_pid"
+wait
+
+# check that every task handles some I/O, even though all I/O was issued
+# from a single CPU. when ublk gets support for round-robin tag
+# allocation, this check can be strengthened to assert that every thread
+# handles the same number of I/Os
+NR_THREADS_THAT_HANDLED_IO=$(grep -c '@' ${UBLK_TMP})
+if [[ $NR_THREADS_THAT_HANDLED_IO -ne $NTHREADS ]]; then
+        echo "only $NR_THREADS_THAT_HANDLED_IO handled I/O! expected $NTHREADS"
+        cat "$UBLK_TMP"
+        ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
new file mode 100644
index 000000000000..f4aa63ff2938
--- /dev/null
+++ b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
@@ -0,0 +1,11 @@
+/*
+ * Tabulates and prints I/O completions per thread for the given device
+ *
+ * $1: dev_t
+*/
+tracepoint:block:block_rq_complete
+{
+	if (args.dev == $1) {
+		@[tid] = count();
+	}
+}
-- 
cgit v1.2.3


From 17574aa2a06b1f5bc447433ceaaa3df3543cc632 Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Thu, 29 May 2025 17:47:17 -0600
Subject: selftests: ublk: add stress test for per io daemons

Add a new test_stress_06 for the per io daemons feature. This is just a
copy of test_stress_01 with the per_io_tasks flag added, with varying
amounts of nthreads. This test is able to reproduce a panic which was
caught manually during development [1]; in the current version of this
patch set, it passes.

Note that this commit also makes all stress tests using the
run_io_and_remove helper more stressful by additionally exercising the
batch submit (queue_rqs) path.

[1] https://lore.kernel.org/linux-block/aDgwGoGCEpwd1mFY@fedora/

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-8-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile          |  1 +
 tools/testing/selftests/ublk/test_common.sh    |  5 ++++
 tools/testing/selftests/ublk/test_stress_06.sh | 36 ++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)
 create mode 100755 tools/testing/selftests/ublk/test_stress_06.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 5d7f4ecfb816..1fb1a95d452c 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -38,6 +38,7 @@ TEST_PROGS += test_stress_02.sh
 TEST_PROGS += test_stress_03.sh
 TEST_PROGS += test_stress_04.sh
 TEST_PROGS += test_stress_05.sh
+TEST_PROGS += test_stress_06.sh
 
 TEST_GEN_PROGS_EXTENDED = kublk
 
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 0145569ee7e9..8a4dbd09feb0 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -278,6 +278,11 @@ __run_io_and_remove()
 	fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
 		--rw=randrw --norandommap --iodepth=256 --size="${size}" --numjobs="$(nproc)" \
 		--runtime=20 --time_based > /dev/null 2>&1 &
+	fio --name=batchjob --filename=/dev/ublkb"${dev_id}" --ioengine=io_uring \
+		--rw=randrw --norandommap --iodepth=256 --size="${size}" \
+		--numjobs="$(nproc)" --runtime=20 --time_based \
+		--iodepth_batch_submit=32 --iodepth_batch_complete_min=32 \
+		--force_async=7 > /dev/null 2>&1 &
 	sleep 2
 	if [ "${kill_server}" = "yes" ]; then
 		local state
diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh
new file mode 100755
index 000000000000..3aee8521032e
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_stress_06.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+TID="stress_06"
+ERR_CODE=0
+
+ublk_io_and_remove()
+{
+	run_io_and_remove "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "stress" "run IO and remove device with per_io_tasks"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_io_and_remove 8G -t null -q 4 --nthreads 5 --per_io_tasks &
+ublk_io_and_remove 256M -t loop -q 4 --nthreads 3 --per_io_tasks \
+        "${UBLK_BACKFILES[0]}" &
+ublk_io_and_remove 256M -t stripe -q 4 --nthreads 4 --per_io_tasks \
+        "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "stress"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From 4b65d5ae971430287855a89635a184c489bd02a5 Mon Sep 17 00:00:00 2001
From: Saket Kumar Bhaskar <skb99@linux.ibm.com>
Date: Mon, 12 May 2025 14:41:07 +0530
Subject: selftests/bpf: Fix bpf selftest build error

On linux-next, build for bpf selftest displays an error due to
mismatch in the expected function signature of bpf_testmod_test_read
and bpf_testmod_test_write.

Commit 97d06802d10a ("sysfs: constify bin_attribute argument of bin_attribute::read/write()")
changed the required type for struct bin_attribute to const struct bin_attribute.

To resolve the error, update corresponding signature for the callback.

Fixes: 97d06802d10a ("sysfs: constify bin_attribute argument of bin_attribute::read/write()")
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Closes: https://lore.kernel.org/all/e915da49-2b9a-4c4c-a34f-877f378129f6@linux.ibm.com/
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Saket Kumar Bhaskar <skb99@linux.ibm.com>
Link: https://lore.kernel.org/r/20250512091108.2015615-1-skb99@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index e6c248e3ae54..e9e918cdf31f 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok;
 
 noinline ssize_t
 bpf_testmod_test_read(struct file *file, struct kobject *kobj,
-		      struct bin_attribute *bin_attr,
+		      const struct bin_attribute *bin_attr,
 		      char *buf, loff_t off, size_t len)
 {
 	struct bpf_testmod_test_read_ctx ctx = {
@@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO);
 
 noinline ssize_t
 bpf_testmod_test_write(struct file *file, struct kobject *kobj,
-		      struct bin_attribute *bin_attr,
+		      const struct bin_attribute *bin_attr,
 		      char *buf, loff_t off, size_t len)
 {
 	struct bpf_testmod_test_write_ctx ctx = {
@@ -567,7 +567,7 @@ static void testmod_unregister_uprobe(void)
 
 static ssize_t
 bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj,
-			 struct bin_attribute *bin_attr,
+			 const struct bin_attribute *bin_attr,
 			 char *buf, loff_t off, size_t len)
 {
 	unsigned long offset = 0;
-- 
cgit v1.2.3


From baa39c169dd526cb0186187fc44ec462266efcc6 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 29 May 2025 13:11:51 -0700
Subject: selftests/bpf: Fix selftest
 btf_tag/btf_type_tag_percpu_vmlinux_helper failure

Ihor Solodrai reported selftest 'btf_tag/btf_type_tag_percpu_vmlinux_helper'
failure ([1]) during 6.16 merge window. The failure log:

  ...
  7: (15) if r0 == 0x0 goto pc+1        ; R0=ptr_css_rstat_cpu()
  ; *(volatile int *)rstat; @ btf_type_tag_percpu.c:68
  8: (61) r1 = *(u32 *)(r0 +0)
  cannot access ptr member updated_children with moff 0 in struct css_rstat_cpu with off 0 size 4

Two changes are needed. First, 'struct cgroup_rstat_cpu' needs to be
replaced with 'struct css_rstat_cpu' to be consistent with new data
structure. Second, layout of 'css_rstat_cpu' is changed compared
to 'cgroup_rstat_cpu'. The first member becomes a pointer so
the bpf prog needs to do 8-byte load instead of 4-byte load.

  [1] https://lore.kernel.org/bpf/6f688f2e-7d26-423a-9029-d1b1ef1c938a@linux.dev/

Cc: Ihor Solodrai <ihor.solodrai@linux.dev>
Cc: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20250529201151.1787575-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
index 69f81cb555ca..d93f68024cc6 100644
--- a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
+++ b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
@@ -57,15 +57,15 @@ int BPF_PROG(test_percpu_load, struct cgroup *cgrp, const char *path)
 SEC("tp_btf/cgroup_mkdir")
 int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path)
 {
-	struct cgroup_rstat_cpu *rstat;
+	struct css_rstat_cpu *rstat;
 	__u32 cpu;
 
 	cpu = bpf_get_smp_processor_id();
-	rstat = (struct cgroup_rstat_cpu *)bpf_per_cpu_ptr(
+	rstat = (struct css_rstat_cpu *)bpf_per_cpu_ptr(
 			cgrp->self.rstat_cpu, cpu);
 	if (rstat) {
 		/* READ_ONCE */
-		*(volatile int *)rstat;
+		*(volatile long *)rstat;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From da12597a1d8c1f91ae509520e9e1bf90648feb93 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 2 Jun 2025 21:21:13 +0800
Subject: selftests: ublk: cover PER_IO_DAEMON in more stress tests

We have stress_03, stress_04 and stress_05 for checking new feature vs.
stress IO & device removal & ublk server crash & recovery, so let the
three existing stress tests cover PER_IO_DAEMON.

Then stress_06 can be removed, since the same test function is included in
stress_03.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250602132113.1398645-1-ming.lei@redhat.com
Reviewed-by: Uday Shankar <ushankar@purestorage.com>
[axboe: remove test_stress_06.sh from Makefile too]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile          |  1 -
 tools/testing/selftests/ublk/test_stress_03.sh |  8 ++++++
 tools/testing/selftests/ublk/test_stress_04.sh |  7 +++++
 tools/testing/selftests/ublk/test_stress_05.sh |  7 +++++
 tools/testing/selftests/ublk/test_stress_06.sh | 36 --------------------------
 5 files changed, 22 insertions(+), 37 deletions(-)
 delete mode 100755 tools/testing/selftests/ublk/test_stress_06.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 1fb1a95d452c..5d7f4ecfb816 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -38,7 +38,6 @@ TEST_PROGS += test_stress_02.sh
 TEST_PROGS += test_stress_03.sh
 TEST_PROGS += test_stress_04.sh
 TEST_PROGS += test_stress_05.sh
-TEST_PROGS += test_stress_06.sh
 
 TEST_GEN_PROGS_EXTENDED = kublk
 
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index 7d728ce50774..6eef282d569f 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -41,5 +41,13 @@ if _have_feature "AUTO_BUF_REG"; then
 fi
 wait
 
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
+	ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
+fi
+wait
+
 _cleanup_test "stress"
 _show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 9bcfa64ea1f0..40d1437ca298 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -38,6 +38,13 @@ if _have_feature "AUTO_BUF_REG"; then
 	ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 	ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
 fi
+
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+	ublk_io_and_kill_daemon 256M -t loop -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_kill_daemon 256M -t stripe -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+	ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+fi
 wait
 
 _cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index bcfc904cefc6..566cfd90d192 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -69,5 +69,12 @@ if _have_feature "AUTO_BUF_REG"; then
 	done
 fi
 
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+	ublk_io_and_remove 256M -t loop -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue"  &
+fi
+wait
+
 _cleanup_test "stress"
 _show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh
deleted file mode 100755
index 3aee8521032e..000000000000
--- a/tools/testing/selftests/ublk/test_stress_06.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_06"
-ERR_CODE=0
-
-ublk_io_and_remove()
-{
-	run_io_and_remove "$@"
-	ERR_CODE=$?
-	if [ ${ERR_CODE} -ne 0 ]; then
-		echo "$TID failure: $*"
-		_show_result $TID $ERR_CODE
-	fi
-}
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "stress" "run IO and remove device with per_io_tasks"
-
-_create_backfile 0 256M
-_create_backfile 1 128M
-_create_backfile 2 128M
-
-ublk_io_and_remove 8G -t null -q 4 --nthreads 5 --per_io_tasks &
-ublk_io_and_remove 256M -t loop -q 4 --nthreads 3 --per_io_tasks \
-        "${UBLK_BACKFILES[0]}" &
-ublk_io_and_remove 256M -t stripe -q 4 --nthreads 4 --per_io_tasks \
-        "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-_cleanup_test "stress"
-_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From f6695269dc52d133ecf6468aa7cbfe29987630ed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 30 May 2025 06:58:00 -0700
Subject: Revert "kunit: configs: Enable CONFIG_INIT_STACK_ALL_PATTERN in
 all_tests"

This reverts commit a571a9a1b120264e24b41eddf1ac5140131bfa84.

The commit in question breaks kunit for older compilers:

$ gcc --version
 gcc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5)

$ ./tools/testing/kunit/kunit.py run  --alltests --json --arch=x86_64
 Configuring KUnit Kernel ...
 Regenerating .config ...
 Populating config with:
 $ make ARCH=x86_64 O=.kunit olddefconfig
 ERROR:root:Not all Kconfig options selected in kunitconfig were in the generated .config.
 This is probably due to unsatisfied dependencies.
 Missing: CONFIG_INIT_STACK_ALL_PATTERN=y

Link: https://lore.kernel.org/20250529083811.778bc31b@kernel.org
Fixes: a571a9a1b120 ("kunit: configs: Enable CONFIG_INIT_STACK_ALL_PATTERN in all_tests")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20250530135800.13437-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/kunit/configs/all_tests.config | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config
index e70c502a16df..422e186cf3cf 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -10,7 +10,6 @@ CONFIG_KUNIT_EXAMPLE_TEST=y
 CONFIG_KUNIT_ALL_TESTS=y
 
 CONFIG_FORTIFY_SOURCE=y
-CONFIG_INIT_STACK_ALL_PATTERN=y
 
 CONFIG_IIO=y
 
-- 
cgit v1.2.3


From d3f2a9587ebe68f5067f9ff624f9a83dfb911f60 Mon Sep 17 00:00:00 2001
From: Bui Quang Minh <minhquangbui99@gmail.com>
Date: Sun, 1 Jun 2025 21:29:13 +0700
Subject: selftests: net: build net/lib dependency in all target

We have the logic to include net/lib automatically for net related
selftests. However, currently, this logic is only in install target
which means only `make install` will have net/lib included. This commit
adds the logic to all target so that all `make`, `make run_tests` and
`make install` will have net/lib included in net related selftests.

Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
Link: https://patch.msgid.link/20250601142914.13379-1-minhquangbui99@gmail.com
Fixes: b86761ff6374 ("selftests: net: add scaffolding for Netlink tests in Python")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 6aa11cd3db42..339b31e6a6b5 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -205,7 +205,7 @@ export KHDR_INCLUDES
 
 all:
 	@ret=1;							\
-	for TARGET in $(TARGETS); do				\
+	for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do	\
 		BUILD_TARGET=$$BUILD/$$TARGET;			\
 		mkdir $$BUILD_TARGET  -p;			\
 		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET	\
-- 
cgit v1.2.3


From fdf4064aaebe4379e6d441141bed83d51b52ad04 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Tue, 20 May 2025 16:42:39 +0200
Subject: selftest/net/ovpn: fix TCP socket creation

TCP sockets cannot be created with AF_UNSPEC, but
one among the supported family must be used.

Since commit 944f8b6abab6 ("selftest/net/ovpn: extend
coverage with more test cases") the default address
family for all tests was changed from AF_INET to AF_UNSPEC,
thus breaking all TCP cases.

Restore AF_INET as default address family for TCP listeners.

Fixes: 944f8b6abab6 ("selftest/net/ovpn: extend coverage with more test cases")
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 tools/testing/selftests/net/ovpn/ovpn-cli.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c
index de9c26f98b2e..9201f2905f2c 100644
--- a/tools/testing/selftests/net/ovpn/ovpn-cli.c
+++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c
@@ -2166,6 +2166,7 @@ static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[])
 
 		ovpn->peers_file = argv[4];
 
+		ovpn->sa_family = AF_INET;
 		if (argc > 5 && !strcmp(argv[5], "ipv6"))
 			ovpn->sa_family = AF_INET6;
 		break;
-- 
cgit v1.2.3


From 9c7e8b31da035fe81399891b2630a8e0c4b09137 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Thu, 22 May 2025 15:33:18 +0200
Subject: selftest/net/ovpn: fix missing file

test-large-mtu.sh is referenced by the Makefile
but does not exist.

Add it along the other scripts.

Fixes: 944f8b6abab6 ("selftest/net/ovpn: extend coverage with more test cases")
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 tools/testing/selftests/net/ovpn/test-large-mtu.sh | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100755 tools/testing/selftests/net/ovpn/test-large-mtu.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/ovpn/test-large-mtu.sh b/tools/testing/selftests/net/ovpn/test-large-mtu.sh
new file mode 100755
index 000000000000..ce2a2cb64f72
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-large-mtu.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+MTU="1500"
+
+source test.sh
-- 
cgit v1.2.3


From a2f4c1ae163b815dc81e3cab97c3149fdc6639e3 Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Tue, 3 Jun 2025 17:38:33 -0600
Subject: selftests: ublk: kublk: improve behavior on init failure

Some failure modes are handled poorly by kublk. For example, if ublk_drv
is built as a module but not currently loaded into the kernel, ./kublk
add ... just hangs forever. This happens because in this case (and a few
others), the worker process does not notify its parent (via a write to
the shared eventfd) that it has tried and failed to initialize, so the
parent hangs forever. Fix this by ensuring that we always notify the
parent process of any initialization failure, and have the parent print
a (not very descriptive) log line when this happens.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250603-ublk_init_fail-v1-1-87c91486230e@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index a98e14e4c245..e2d2042810d4 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1112,7 +1112,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	__u64 features;
 	const struct ublk_tgt_ops *ops;
 	struct ublksrv_ctrl_dev_info *info;
-	struct ublk_dev *dev;
+	struct ublk_dev *dev = NULL;
 	int dev_id = ctx->dev_id;
 	int ret, i;
 
@@ -1120,13 +1120,15 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	if (!ops) {
 		ublk_err("%s: no such tgt type, type %s\n",
 				__func__, tgt_type);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto fail;
 	}
 
 	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
 		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
 				__func__, nr_queues, depth);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto fail;
 	}
 
 	/* default to 1:1 threads:queues if nthreads is unspecified */
@@ -1136,30 +1138,37 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	if (nthreads > UBLK_MAX_THREADS) {
 		ublk_err("%s: %u is too many threads (max %u)\n",
 				__func__, nthreads, UBLK_MAX_THREADS);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto fail;
 	}
 
 	if (nthreads != nr_queues && !ctx->per_io_tasks) {
 		ublk_err("%s: threads %u must be same as queues %u if "
 			"not using per_io_tasks\n",
 			__func__, nthreads, nr_queues);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto fail;
 	}
 
 	dev = ublk_ctrl_init();
 	if (!dev) {
 		ublk_err("%s: can't alloc dev id %d, type %s\n",
 				__func__, dev_id, tgt_type);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	/* kernel doesn't support get_features */
 	ret = ublk_ctrl_get_features(dev, &features);
-	if (ret < 0)
-		return -EINVAL;
+	if (ret < 0) {
+		ret = -EINVAL;
+		goto fail;
+	}
 
-	if (!(features & UBLK_F_CMD_IOCTL_ENCODE))
-		return -ENOTSUP;
+	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
+		ret = -ENOTSUP;
+		goto fail;
+	}
 
 	info = &dev->dev_info;
 	info->dev_id = ctx->dev_id;
@@ -1200,7 +1209,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 fail:
 	if (ret < 0)
 		ublk_send_dev_event(ctx, dev, -1);
-	ublk_ctrl_deinit(dev);
+	if (dev)
+		ublk_ctrl_deinit(dev);
 	return ret;
 }
 
@@ -1262,6 +1272,8 @@ run:
 		shmctl(ctx->_shmid, IPC_RMID, NULL);
 		/* wait for child and detach from it */
 		wait(NULL);
+		if (exit_code == EXIT_FAILURE)
+			ublk_err("%s: command failed\n", __func__);
 		exit(exit_code);
 	} else {
 		exit(EXIT_FAILURE);
-- 
cgit v1.2.3


From febe7eda74d105437c7532b4a76ff14eb6007828 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 23 May 2025 14:20:45 +0200
Subject: selftests: netfilter: nft_concat_range.sh: prefer per element
 counters for testing

The selftest uses following rule:
  ... @test counter name "test"

Then sends a packet, then checks if the named counter did increment or
not.

This is fine for the 'no-match' test case: If anything matches the
counter increments and the test fails as expected.

But for the 'should match' test cases this isn't optimal.
Consider buggy matching, where the packet matches entry x, but it
should have matched entry y.

In that case the test would erronously pass.

Rework the selftest to use per-element counters to avoid this.

After sending packet that should have matched entry x, query the
relevant element via 'nft reset element' and check that its counter
had incremented.

The 'nomatch' case isn't altered, no entry should match so the named
counter must be 0, changing it to the per-element counter would then
pass if another entry matches.

The downside of this change is a slight increase in test run-time by
a few seconds.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../selftests/net/netfilter/nft_concat_range.sh    | 40 ++++++++++++++++------
 1 file changed, 30 insertions(+), 10 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
index efea93cf23d4..86b8ce742700 100755
--- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
@@ -419,6 +419,7 @@ table inet filter {
 
 	set test {
 		type ${type_spec}
+		counter
 		flags interval,timeout
 	}
 
@@ -1158,8 +1159,17 @@ del() {
 	fi
 }
 
-# Return packet count from 'test' counter in 'inet filter' table
+# Return packet count for elem $1 from 'test' counter in 'inet filter' table
 count_packets() {
+	found=0
+	for token in $(nft reset element inet filter test "${1}" ); do
+		[ ${found} -eq 1 ] && echo "${token}" && return
+		[ "${token}" = "packets" ] && found=1
+	done
+}
+
+# Return packet count from 'test' counter in 'inet filter' table
+count_packets_nomatch() {
 	found=0
 	for token in $(nft list counter inet filter test); do
 		[ ${found} -eq 1 ] && echo "${token}" && return
@@ -1206,6 +1216,10 @@ perf() {
 
 # Set MAC addresses, send single packet, check that it matches, reset counter
 send_match() {
+	local elem="$1"
+
+	shift
+
 	ip link set veth_a address "$(format_mac "${1}")"
 	ip -n B link set veth_b address "$(format_mac "${2}")"
 
@@ -1216,7 +1230,7 @@ send_match() {
 		eval src_"$f"=\$\(format_\$f "${2}"\)
 	done
 	eval send_\$proto
-	if [ "$(count_packets)" != "1" ]; then
+	if [ "$(count_packets "$elem")" != "1" ]; then
 		err "${proto} packet to:"
 		err "  $(for f in ${dst}; do
 			 eval format_\$f "${1}"; printf ' '; done)"
@@ -1242,7 +1256,7 @@ send_nomatch() {
 		eval src_"$f"=\$\(format_\$f "${2}"\)
 	done
 	eval send_\$proto
-	if [ "$(count_packets)" != "0" ]; then
+	if [ "$(count_packets_nomatch)" != "0" ]; then
 		err "${proto} packet to:"
 		err "  $(for f in ${dst}; do
 			 eval format_\$f "${1}"; printf ' '; done)"
@@ -1262,6 +1276,8 @@ send_nomatch() {
 test_correctness_main() {
 	range_size=1
 	for i in $(seq "${start}" $((start + count))); do
+		local elem=""
+
 		end=$((start + range_size))
 
 		# Avoid negative or zero-sized port ranges
@@ -1272,15 +1288,16 @@ test_correctness_main() {
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
 
-		add "$(format)" || return 1
+		elem="$(format)"
+		add "$elem" || return 1
 		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
-			send_match "${j}" $((j + src_delta)) || return 1
+			send_match "$elem" "${j}" $((j + src_delta)) || return 1
 		done
 		send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
 
 		# Delete elements now and then
 		if [ $((i % 3)) -eq 0 ]; then
-			del "$(format)" || return 1
+			del "$elem" || return 1
 			for j in $(seq "$start" \
 				   $((range_size / 2 + 1)) ${end}); do
 				send_nomatch "${j}" $((j + src_delta)) \
@@ -1572,14 +1589,17 @@ test_timeout() {
 
 	range_size=1
 	for i in $(seq "$start" $((start + count))); do
+		local elem=""
+
 		end=$((start + range_size))
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
 
-		add "$(format)" || return 1
+		elem="$(format)"
+		add "$elem" || return 1
 
 		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
-			send_match "${j}" $((j + src_delta)) || return 1
+			send_match "$elem" "${j}" $((j + src_delta)) || return 1
 		done
 
 		range_size=$((range_size + 1))
@@ -1737,7 +1757,7 @@ test_bug_reload() {
 		srcend=$((end + src_delta))
 
 		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
-			send_match "${j}" $((j + src_delta)) || return 1
+			send_match "$(format)" "${j}" $((j + src_delta)) || return 1
 		done
 
 		range_size=$((range_size + 1))
@@ -1817,7 +1837,7 @@ test_bug_avx2_mismatch()
 	dst_addr6="$a2"
 	send_icmp6
 
-	if [ "$(count_packets)" -gt "0" ]; then
+	if [ "$(count_packets "{ icmpv6 . $a1 }")" -gt "0" ]; then
 		err "False match for $a2"
 		return 1
 	fi
-- 
cgit v1.2.3


From 38399f2b0fe4d44226bfb7eba9e137251c8b2571 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 23 May 2025 14:20:46 +0200
Subject: selftests: netfilter: nft_concat_range.sh: add datapath check for map
 fill bug

commit 0935ee6032df ("selftests: netfilter: add test case for recent mismatch bug")
added a regression check for incorrect initial fill of the result map
that was fixed with 791a615b7ad2 ("netfilter: nf_set_pipapo: fix initial map fill").

The test used 'nft get element', i.e., control plane checks for
match/nomatch results.

The control plane however doesn't use avx2 version, so we need to
send+match packets.

As the additional packet match/nomatch is slow, don't do this for
every element added/removed: add and use maybe_send_(no)match
helpers and use them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../selftests/net/netfilter/nft_concat_range.sh    | 62 ++++++++++++++++++++--
 1 file changed, 58 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
index 86b8ce742700..cd12b8b5ac0e 100755
--- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
@@ -378,7 +378,7 @@ display		net,port,proto
 type_spec	ipv4_addr . inet_service . inet_proto
 chain_spec	ip daddr . udp dport . meta l4proto
 dst		addr4 port proto
-src
+src		 
 start		1
 count		9
 src_delta	9
@@ -1269,6 +1269,42 @@ send_nomatch() {
 	fi
 }
 
+maybe_send_nomatch() {
+	local elem="$1"
+	local what="$4"
+
+	[ $((RANDOM%20)) -gt 0 ] && return
+
+	dst_addr4="$2"
+	dst_port="$3"
+	send_udp
+
+	if [ "$(count_packets_nomatch)" != "0" ]; then
+		err "Packet to $dst_addr4:$dst_port did match $what"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+maybe_send_match() {
+	local elem="$1"
+	local what="$4"
+
+	[ $((RANDOM%20)) -gt 0 ] && return
+
+	dst_addr4="$2"
+	dst_port="$3"
+	send_udp
+
+	if [ "$(count_packets "{ $elem }")" != "1" ]; then
+		err "Packet to $dst_addr4:$dst_port did not match $what"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+	nft reset counter inet filter test >/dev/null
+	nft reset element inet filter test "{ $elem }" >/dev/null
+}
+
 # Correctness test template:
 # - add ranged element, check that packets match it
 # - check that packets outside range don't match it
@@ -1776,22 +1812,34 @@ test_bug_net_port_proto_match() {
 	range_size=1
 	for i in $(seq 1 10); do
 		for j in $(seq 1 20) ; do
-			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+			local dport=$j
+
+			elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
+
+			# too slow, do not test all addresses
+			maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "before add" || return 1
 
 			nft "add element inet filter test { $elem }" || return 1
+
+			maybe_send_match "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "after add" || return 1
+
 			nft "get element inet filter test { $elem }" | grep -q "$elem"
 			if [ $? -ne 0 ];then
 				local got=$(nft "get element inet filter test { $elem }")
 				err "post-add: should have returned $elem but got $got"
 				return 1
 			fi
+
+			maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "out-of-range" || return 1
 		done
 	done
 
 	# recheck after set was filled
 	for i in $(seq 1 10); do
 		for j in $(seq 1 20) ; do
-			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+			local dport=$j
+
+			elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
 
 			nft "get element inet filter test { $elem }" | grep -q "$elem"
 			if [ $? -ne 0 ];then
@@ -1799,6 +1847,9 @@ test_bug_net_port_proto_match() {
 				err "post-fill: should have returned $elem but got $got"
 				return 1
 			fi
+
+			maybe_send_match "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "recheck" || return 1
+			maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "recheck out-of-range" || return 1
 		done
 	done
 
@@ -1806,9 +1857,10 @@ test_bug_net_port_proto_match() {
 	for i in $(seq 1 10); do
 		for j in $(seq 1 20) ; do
 			local rnd=$((RANDOM%10))
+			local dport=$j
 			local got=""
 
-			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+			elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
 			if [ $rnd -gt 0 ];then
 				continue
 			fi
@@ -1819,6 +1871,8 @@ test_bug_net_port_proto_match() {
 				err "post-delete: query for $elem returned $got instead of error."
 				return 1
 			fi
+
+			maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "match after deletion" || return 1
 		done
 	done
 
-- 
cgit v1.2.3


From 3c3c3248496a3a1848ec5d923f2eee0edf60226e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 30 May 2025 12:34:03 +0200
Subject: selftests: netfilter: nft_nat.sh: add test for reverse clash with nat

This will fail without the previous bug fix because we erronously
believe that the clashing entry went way.

However, the clash exists in the opposite direction due to an
existing nat mapping:
 PASS: IP statless for ns2-LgTIuS
 ERROR: failed to test udp ns1-x4iyOW to ns2-LgTIuS with dnat rule step 2, result: ""

This is partially adapted from test instructions from the below
ubuntu tracker.

Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2109889
Signed-off-by: Florian Westphal <fw@strlen.de>
Tested-by: Shaun Brady <brady.1345@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/net/netfilter/nft_nat.sh | 81 ++++++++++++++++++++++--
 1 file changed, 76 insertions(+), 5 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh
index 9e39de26455f..a954754b99b3 100755
--- a/tools/testing/selftests/net/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/net/netfilter/nft_nat.sh
@@ -866,6 +866,24 @@ EOF
 	ip netns exec "$ns0" nft delete table $family nat
 }
 
+file_cmp()
+{
+	local infile="$1"
+	local outfile="$2"
+
+	if ! cmp "$infile" "$outfile";then
+		echo -n "Infile "
+		ls -l "$infile"
+		echo -n "Outfile "
+		ls -l "$outfile"
+		echo "ERROR: in and output file mismatch when checking $msg" 1>&1
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
 test_stateless_nat_ip()
 {
 	local lret=0
@@ -966,11 +984,7 @@ EOF
 
 	wait
 
-	if ! cmp "$INFILE" "$OUTFILE";then
-		ls -l "$INFILE" "$OUTFILE"
-		echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2
-		lret=1
-	fi
+	file_cmp "$INFILE" "$OUTFILE" "udp with stateless nat" || lret=1
 
 	:> "$OUTFILE"
 
@@ -991,6 +1005,62 @@ EOF
 	return $lret
 }
 
+test_dnat_clash()
+{
+	local lret=0
+
+	if ! socat -h > /dev/null 2>&1;then
+		echo "SKIP: Could not run dnat clash test without socat tool"
+		[ $ret -eq 0 ] && ret=$ksft_skip
+		return $ksft_skip
+	fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+flush ruleset
+table ip dnat-test {
+ chain prerouting {
+  type nat hook prerouting priority dstnat; policy accept;
+  ip daddr 10.0.2.1 udp dport 1234 counter dnat to 10.0.1.1:1234
+ }
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add dnat rules"
+		[ $ret -eq 0 ] && ret=$ksft_skip
+		return $ksft_skip
+	fi
+
+	local udpdaddr="10.0.2.1"
+	for i in 1 2;do
+		echo "PING $udpdaddr" > "$INFILE"
+		echo "PONG 10.0.1.1 step $i" | ip netns exec "$ns0" timeout 3 socat STDIO UDP4-LISTEN:1234,bind=10.0.1.1 > "$OUTFILE" 2>/dev/null &
+		local lpid=$!
+
+		busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1234 "-u"
+
+		result=$(ip netns exec "$ns1" timeout 3 socat STDIO UDP4-SENDTO:"$udpdaddr:1234,sourceport=4321" < "$INFILE")
+		udpdaddr="10.0.1.1"
+
+		if [ "$result" != "PONG 10.0.1.1 step $i" ] ; then
+			echo "ERROR: failed to test udp $ns1 to $ns2 with dnat rule step $i, result: \"$result\"" 1>&2
+			lret=1
+			ret=1
+		fi
+
+		wait
+
+		file_cmp "$INFILE" "$OUTFILE" "udp dnat step $i" || lret=1
+
+		:> "$OUTFILE"
+	done
+
+	test $lret -eq 0 && echo "PASS: IP dnat clash $ns1:$ns2"
+
+	ip netns exec "$ns0" nft flush ruleset
+
+	return $lret
+}
+
 # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99
 for i in "$ns0" "$ns1" "$ns2" ;do
 ip netns exec "$i" nft -f /dev/stdin <<EOF
@@ -1147,6 +1217,7 @@ $test_inet_nat && test_redirect6 inet
 
 test_port_shadowing
 test_stateless_nat_ip
+test_dnat_clash
 
 if [ $ret -ne 0 ];then
 	echo -n "FAIL: "
-- 
cgit v1.2.3


From 1a9dcf69c7a97e733aa2fc026db22f22928ca7b7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 28 May 2025 10:55:19 +0200
Subject: selftests/futex: getopt() requires int as return value.

Mark reported that futex_priv_hash fails on ARM64.
It turns out that the command line parsing does not terminate properly
and ends in the default case assuming an invalid option was passed.

Use an int as the return type for getopt().

Closes: https://lore.kernel.org/all/31869a69-063f-44a3-a079-ba71b2506cce@sirena.org.uk/
Fixes: 3163369407baf ("selftests/futex: Add futex_numa_mpol")
Fixes: cda95faef7bcf ("selftests/futex: Add futex_priv_hash")
Reported-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250528085521.1938355-2-bigeasy@linutronix.de
---
 tools/testing/selftests/futex/functional/futex_numa_mpol.c | 2 +-
 tools/testing/selftests/futex/functional/futex_priv_hash.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
index 20a9d3ecf743..564dbd02d2f4 100644
--- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c
+++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
@@ -144,7 +144,7 @@ int main(int argc, char *argv[])
 	struct futex32_numa *futex_numa;
 	int mem_size, i;
 	void *futex_ptr;
-	char c;
+	int c;
 
 	while ((c = getopt(argc, argv, "chv:")) != -1) {
 		switch (c) {
diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c
index 2dca18fefedc..24a92dc94eb8 100644
--- a/tools/testing/selftests/futex/functional/futex_priv_hash.c
+++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
 	pthread_mutexattr_t mutex_attr_pi;
 	int use_global_hash = 0;
 	int ret;
-	char c;
+	int c;
 
 	while ((c = getopt(argc, argv, "cghv:")) != -1) {
 		switch (c) {
-- 
cgit v1.2.3


From 0ecb4232fc65e659ca7020f8bb2e0fc347acfb7d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 28 May 2025 10:55:20 +0200
Subject: selftests/futex: Set the home_node in futex_numa_mpol

The test fails at the MPOL step if multiple nodes are available. The
reason is that mbind() sets the policy but the home_node, which is
retrieved by the futex code, is not set. This causes to retrieve the
current node and with multiple nodes it fails on one of the iterations.

Use numa_set_mempolicy_home_node() to set the expected node.
Use ksft_exit_fail_msg() to fail and exit in order not to confuse ktap.

Fixes: 3163369407baf ("selftests/futex: Add futex_numa_mpol")
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250528085521.1938355-3-bigeasy@linutronix.de
---
 tools/testing/selftests/futex/functional/futex_numa_mpol.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
index 564dbd02d2f4..a9ecfb2d3932 100644
--- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c
+++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
@@ -210,6 +210,10 @@ int main(int argc, char *argv[])
 		ret = mbind(futex_ptr, mem_size, MPOL_BIND, &nodemask,
 			    sizeof(nodemask) * 8, 0);
 		if (ret == 0) {
+			ret = numa_set_mempolicy_home_node(futex_ptr, mem_size, i, 0);
+			if (ret != 0)
+				ksft_exit_fail_msg("Failed to set home node: %m, %d\n", errno);
+
 			ksft_print_msg("Node %d test\n", i);
 			futex_numa->futex = 0;
 			futex_numa->numa = FUTEX_NO_NODE;
@@ -220,8 +224,8 @@ int main(int argc, char *argv[])
 			if (0)
 				test_futex_mpol(futex_numa, 0);
 			if (futex_numa->numa != i) {
-				ksft_test_result_fail("Returned NUMA node is %d expected %d\n",
-						      futex_numa->numa, i);
+				ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n",
+						   futex_numa->numa, i);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 9a9864fd09c7e52440c05e70609bf5ee8da425d0 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 5 Jun 2025 12:36:10 +0200
Subject: KVM: arm64: selftests: Fix help text for arch_timer_edge_cases

Fix the help text for arch_timer_edge_cases to show the correct
option for setting the wait time.

Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250605103613.14544-2-sebott@redhat.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index a36a7e2db434..c4716e0c1438 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -986,7 +986,7 @@ static void test_print_help(char *name)
 	pr_info("\t-b: Test both physical and virtual timers (default: true)\n");
 	pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n",
 	     LONG_WAIT_TEST_MS);
-	pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n",
+	pr_info("\t-w: Delta (in ms) used for wait times (default: %u)\n",
 		WAIT_TEST_MS);
 	pr_info("\t-p: Test physical timer (default: true)\n");
 	pr_info("\t-v: Test virtual timer (default: true)\n");
-- 
cgit v1.2.3


From 050632ae6571d0f148fa0d4b90b6409a12645461 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 5 Jun 2025 12:36:11 +0200
Subject: KVM: arm64: selftests: Fix thread migration in arch_timer_edge_cases

arch_timer_edge_cases tries to migrate itself across host cpus. Before
the first test, it migrates to cpu 0 by setting up an affinity mask with
only bit 0 set. After that it looks for the next possible cpu in the
current affinity mask which still has only bit 0 set. So there is no
migration at all.

Fix this by reading the default mask at start and use this to find
the next cpu in each iteration.

Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250605103613.14544-3-sebott@redhat.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index c4716e0c1438..a813b4c6c817 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -849,17 +849,17 @@ static void guest_code(enum arch_timer timer)
 	GUEST_DONE();
 }
 
+static cpu_set_t default_cpuset;
+
 static uint32_t next_pcpu(void)
 {
 	uint32_t max = get_nprocs();
 	uint32_t cur = sched_getcpu();
 	uint32_t next = cur;
-	cpu_set_t cpuset;
+	cpu_set_t cpuset = default_cpuset;
 
 	TEST_ASSERT(max > 1, "Need at least two physical cpus");
 
-	sched_getaffinity(0, sizeof(cpuset), &cpuset);
-
 	do {
 		next = (next + 1) % CPU_SETSIZE;
 	} while (!CPU_ISSET(next, &cpuset));
@@ -1046,6 +1046,8 @@ int main(int argc, char *argv[])
 	if (!parse_args(argc, argv))
 		exit(KSFT_SKIP);
 
+	sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset);
+
 	if (test_args.test_virtual) {
 		test_vm_create(&vm, &vcpu, VIRTUAL);
 		test_run(vm, vcpu);
-- 
cgit v1.2.3


From 05ce38d489dbc96eb1dd700b287f949cb8cc0610 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 5 Jun 2025 12:36:12 +0200
Subject: KVM: arm64: selftests: Fix xVAL init in arch_timer_edge_cases

arch_timer_edge_cases hits the following assertion in < 10% of the test runs:

==== Test Assertion Failure ====
  arm64/arch_timer_edge_cases.c:490: timer_get_cntct(timer) >= DEF_CNT + (timer_get_cntfrq() * (uint64_t)(delta_2_ms) / 1000)
  pid=17110 tid=17110 errno=4 - Interrupted system call
     1  0x0000000000404ec7: test_run at arch_timer_edge_cases.c:945
     2  0x0000000000401fa3: main at arch_timer_edge_cases.c:1074
     3  0x0000ffffa774b587: ?? ??:0
     4  0x0000ffffa774b65f: ?? ??:0
     5  0x000000000040206f: _start at ??:?
  timer_get_cntct(timer) >= DEF_CNT + msec_to_cycles(delta_2_ms)

Enabling the timer without proper xval initialization in set_tval_irq()
resulted in an early interrupt during timer reprogramming. Make sure
to set the xval before setting the enable bit.

Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250605103613.14544-4-sebott@redhat.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index a813b4c6c817..be8bbedc933b 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -191,8 +191,8 @@ static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles,
 {
 	atomic_set(&shared_data.handled, 0);
 	atomic_set(&shared_data.spurious, 0);
-	timer_set_ctl(timer, ctl);
 	timer_set_tval(timer, tval_cycles);
+	timer_set_ctl(timer, ctl);
 }
 
 static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl,
-- 
cgit v1.2.3


From fad4cf944839da7f5c3376243aa353295c88f588 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 5 Jun 2025 12:36:13 +0200
Subject: KVM: arm64: selftests: Determine effective counter width in
 arch_timer_edge_cases

arch_timer_edge_cases uses ~0 as the maximum counter value, however there's
no architectural guarantee that this is valid.

Figure out the effective counter width based on the effective frequency
like it's done by the kernel.

This also serves as a workaround for AC03_CPU_14 that led to the
following assertion failure on ampere-one machines:

==== Test Assertion Failure ====
  arm64/arch_timer_edge_cases.c:169: timer_condition == istatus
  pid=11236 tid=11236 errno=4 - Interrupted system call
     1  0x0000000000404ce7: test_run at arch_timer_edge_cases.c:938
     2  0x0000000000401ebb: main at arch_timer_edge_cases.c:1053
     3  0x0000ffff9fa8625b: ?? ??:0
     4  0x0000ffff9fa8633b: ?? ??:0
     5  0x0000000000401fef: _start at ??:?
  0x1 != 0x0 (timer_condition != istatus)

Note that the following subtest only worked since the counter initialized
with CVAL_MAX would instantly overflow (which is no longer the case):

	test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm);

To fix this we could swap CVAL_MAX for 0 here but since that is already
done by test_move_counters_behind_timers() let's remove that subtest.

Link: https://lore.kernel.org/kvmarm/ac1de1d2-ef2b-d439-dc48-8615e121b07b@redhat.com
Link: https://amperecomputing.com/assets/AmpereOne_Developer_ER_v0_80_20240823_28945022f4.pdf
Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250605103613.14544-5-sebott@redhat.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 .../selftests/kvm/arm64/arch_timer_edge_cases.c    | 27 ++++++++++++++--------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index be8bbedc933b..b4d22b3ab7cc 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -22,7 +22,8 @@
 #include "gic.h"
 #include "vgic.h"
 
-static const uint64_t CVAL_MAX = ~0ULL;
+/* Depends on counter width. */
+static uint64_t CVAL_MAX;
 /* tval is a signed 32-bit int. */
 static const int32_t TVAL_MAX = INT32_MAX;
 static const int32_t TVAL_MIN = INT32_MIN;
@@ -30,8 +31,8 @@ static const int32_t TVAL_MIN = INT32_MIN;
 /* After how much time we say there is no IRQ. */
 static const uint32_t TIMEOUT_NO_IRQ_US = 50000;
 
-/* A nice counter value to use as the starting one for most tests. */
-static const uint64_t DEF_CNT = (CVAL_MAX / 2);
+/* Counter value to use as the starting one for most tests. Set to CVAL_MAX/2 */
+static uint64_t DEF_CNT;
 
 /* Number of runs. */
 static const uint32_t NR_TEST_ITERS_DEF = 5;
@@ -732,12 +733,6 @@ static void test_move_counters_ahead_of_timers(enum arch_timer timer)
 		test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1,
 					wm);
 	}
-
-	for (i = 0; i < ARRAY_SIZE(sleep_method); i++) {
-		sleep_method_t sm = sleep_method[i];
-
-		test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm);
-	}
 }
 
 /*
@@ -975,6 +970,8 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
 	test_init_timer_irq(*vm, *vcpu);
 	vgic_v3_setup(*vm, 1, 64);
 	sync_global_to_guest(*vm, test_args);
+	sync_global_to_guest(*vm, CVAL_MAX);
+	sync_global_to_guest(*vm, DEF_CNT);
 }
 
 static void test_print_help(char *name)
@@ -1035,6 +1032,17 @@ static bool parse_args(int argc, char *argv[])
 	return false;
 }
 
+static void set_counter_defaults(void)
+{
+	const uint64_t MIN_ROLLOVER_SECS = 40ULL * 365 * 24 * 3600;
+	uint64_t freq = read_sysreg(CNTFRQ_EL0);
+	uint64_t width = ilog2(MIN_ROLLOVER_SECS * freq);
+
+	width = clamp(width, 56, 64);
+	CVAL_MAX = GENMASK_ULL(width - 1, 0);
+	DEF_CNT = CVAL_MAX / 2;
+}
+
 int main(int argc, char *argv[])
 {
 	struct kvm_vcpu *vcpu;
@@ -1047,6 +1055,7 @@ int main(int argc, char *argv[])
 		exit(KSFT_SKIP);
 
 	sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset);
+	set_counter_defaults();
 
 	if (test_args.test_virtual) {
 		test_vm_create(&vm, &vcpu, VIRTUAL);
-- 
cgit v1.2.3


From 7eb6b63aa3c3b406512db15943ce9eb114095b51 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 3 Jun 2025 17:16:52 -0700
Subject: selftests: drv-net: add configs for the TSO test

Add missing config options for the tso.py test, specifically
to make sure the kernel is built with vxlan and gre tunnels.

I noticed this while adding a TSO-capable device QEMU to the CI.
Previously we only run virtio tests and it doesn't report LSO
stats on the QEMU we have.

Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250604001653.853008-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/config | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/hw/config

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config
new file mode 100644
index 000000000000..88ae719e6f8f
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/config
@@ -0,0 +1,5 @@
+CONFIG_IPV6=y
+CONFIG_IPV6_GRE=y
+CONFIG_NET_IPGRE=y
+CONFIG_NET_IPGRE_DEMUX=y
+CONFIG_VXLAN=y
-- 
cgit v1.2.3


From c68804c934e3197e34560744854c57cf88dff8e7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 3 Jun 2025 18:20:31 -0700
Subject: selftests: drv-net: tso: fix the GRE device name

The device type for IPv4 GRE is "gre" not "ipgre",
unlike for IPv6 which uses "ip6gre".

Not sure how I missed this when writing the test, perhaps
because all HW I have access to is on an IPv6-only network.

Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test")
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250604012031.891242-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/tso.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py
index e1ecb92f79d9..eec647e7ec19 100755
--- a/tools/testing/selftests/drivers/net/hw/tso.py
+++ b/tools/testing/selftests/drivers/net/hw/tso.py
@@ -216,7 +216,7 @@ def main() -> None:
             ("",            "6", "tx-tcp6-segmentation",          None),
             ("vxlan",        "", "tx-udp_tnl-segmentation",       ("vxlan",  True,  "id 100 dstport 4789 noudpcsum")),
             ("vxlan_csum",   "", "tx-udp_tnl-csum-segmentation",  ("vxlan",  False, "id 100 dstport 4789 udpcsum")),
-            ("gre",         "4", "tx-gre-segmentation",           ("ipgre",  False,  "")),
+            ("gre",         "4", "tx-gre-segmentation",           ("gre",    False,  "")),
             ("gre",         "6", "tx-gre-segmentation",           ("ip6gre", False,  "")),
         )
 
-- 
cgit v1.2.3


From e6854be4d80ea266a7be64a65a0322bcdfa72807 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 3 Jun 2025 18:20:55 -0700
Subject: selftests: drv-net: tso: make bkg() wait for socat to quit

Commit 846742f7e32f ("selftests: drv-net: add a warning for
bkg + shell + terminate") added a warning for bkg() used
with terminate=True. The tso test was missed as we didn't
have it running anywhere in NIPA. Add exit_wait=True, to avoid:

  # Warning: combining shell and terminate is risky!
  #          SIGTERM may not reach the child on zsh/ksh!

getting printed twice for every variant.

Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250604012055.891431-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/tso.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py
index eec647e7ec19..3370827409aa 100755
--- a/tools/testing/selftests/drivers/net/hw/tso.py
+++ b/tools/testing/selftests/drivers/net/hw/tso.py
@@ -39,7 +39,7 @@ def run_one_stream(cfg, ipver, remote_v4, remote_v6, should_lso):
     port = rand_port()
     listen_cmd = f"socat -{ipver} -t 2 -u TCP-LISTEN:{port},reuseport /dev/null,ignoreeof"
 
-    with bkg(listen_cmd, host=cfg.remote) as nc:
+    with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc:
         wait_port_listen(port, host=cfg.remote)
 
         if ipver == "4":
-- 
cgit v1.2.3


From ee0d03053e7009a3a3532fb37f6c94bfa0a8cca3 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Fri, 11 Apr 2025 10:46:00 +0800
Subject: RISC-V: vDSO: Wire up getrandom() vDSO implementation

Hook up the generic vDSO implementation to the generic vDSO getrandom
implementation by providing the required __arch_chacha20_blocks_nostack
and getrandom_syscall implementations. Also wire up the selftests.

The benchmark result:

	vdso: 25000000 times in 2.466341333 seconds
	libc: 25000000 times in 41.447720005 seconds
	syscall: 25000000 times in 41.043926672 seconds

	vdso: 25000000 x 256 times in 162.286219353 seconds
	libc: 25000000 x 256 times in 2953.855018685 seconds
	syscall: 25000000 x 256 times in 2796.268546000 seconds

[ alex: - Fix dynamic relocation
        - Squash Nathan's fix https://lore.kernel.org/all/20250423-riscv-fix-compat_vdso-lld-v2-1-b7bbbc244501@kernel.org/
	- Add comment from Loongarch ]

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Link: https://lore.kernel.org/r/20250411024600.16045-1-xry111@xry111.site
Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index d6e09af7c0a9..a4a82e1c28a9 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -11,6 +11,8 @@
 #include "../../../../arch/loongarch/vdso/vgetrandom-chacha.S"
 #elif defined(__powerpc__) || defined(__powerpc64__)
 #include "../../../../arch/powerpc/kernel/vdso/vgetrandom-chacha.S"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S"
 #elif defined(__s390x__)
 #include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
 #elif defined(__x86_64__)
-- 
cgit v1.2.3


From 6fb6223347d5d9512875120267c117e7437f0db6 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Thu, 29 May 2025 15:56:49 +0000
Subject: selftests/mm: extract read_sysfs and write_sysfs into vm_util

Extract read_sysfs and write_sysfs into vm_util.  Meanwhile, rename the
function in thuge-gen that has the same name as read_sysfs.

Link: https://lkml.kernel.org/r/20250529155650.4017699-4-pulehui@huaweicloud.com
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/ksm_tests.c | 32 ++--------------------------
 tools/testing/selftests/mm/thuge-gen.c |  6 +++---
 tools/testing/selftests/mm/vm_util.c   | 38 ++++++++++++++++++++++++++++++++++
 tools/testing/selftests/mm/vm_util.h   |  2 ++
 4 files changed, 45 insertions(+), 33 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index dcdd5bb20f3d..e80deac1436b 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -58,40 +58,12 @@ int debug;
 
 static int ksm_write_sysfs(const char *file_path, unsigned long val)
 {
-	FILE *f = fopen(file_path, "w");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fprintf(f, "%lu", val) < 0) {
-		perror("fprintf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
+	return write_sysfs(file_path, val);
 }
 
 static int ksm_read_sysfs(const char *file_path, unsigned long *val)
 {
-	FILE *f = fopen(file_path, "r");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fscanf(f, "%lu", val) != 1) {
-		perror("fscanf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
+	return read_sysfs(file_path, val);
 }
 
 static void ksm_print_sysfs(void)
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index a41bc1234b37..95b6f043a3cb 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -77,7 +77,7 @@ void show(unsigned long ps)
 	system(buf);
 }
 
-unsigned long read_sysfs(int warn, char *fmt, ...)
+unsigned long thuge_read_sysfs(int warn, char *fmt, ...)
 {
 	char *line = NULL;
 	size_t linelen = 0;
@@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
 
 unsigned long read_free(unsigned long ps)
 {
-	return read_sysfs(ps != getpagesize(),
+	return thuge_read_sysfs(ps != getpagesize(),
 			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
 			  ps >> 10);
 }
@@ -195,7 +195,7 @@ void find_pagesizes(void)
 	}
 	globfree(&g);
 
-	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+	if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
 		ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
 				   largest * NUM_PAGES);
 
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 61d7bf1f8c62..5492e3f784df 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -486,3 +486,41 @@ int close_procmap(struct procmap_fd *procmap)
 {
 	return close(procmap->fd);
 }
+
+int write_sysfs(const char *file_path, unsigned long val)
+{
+	FILE *f = fopen(file_path, "w");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fprintf(f, "%lu", val) < 0) {
+		perror("fprintf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
+
+int read_sysfs(const char *file_path, unsigned long *val)
+{
+	FILE *f = fopen(file_path, "r");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fscanf(f, "%lu", val) != 1) {
+		perror("fscanf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index adb5d294a220..b8136d12a0f8 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -88,6 +88,8 @@ int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
 int query_procmap(struct procmap_fd *procmap);
 bool find_vma_procmap(struct procmap_fd *procmap, void *address);
 int close_procmap(struct procmap_fd *procmap);
+int write_sysfs(const char *file_path, unsigned long val);
+int read_sysfs(const char *file_path, unsigned long *val);
 
 static inline int open_self_procmap(struct procmap_fd *procmap_out)
 {
-- 
cgit v1.2.3


From efe99fabeb11b030c89a7dc5a5e7a7558d0dc7ec Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Thu, 29 May 2025 15:56:50 +0000
Subject: selftests/mm: add test about uprobe pte be orphan during vma merge

Add test about uprobe pte be orphan during vma merge.

[akpm@linux-foundation.org: include sys/syscall.h, per Lorenzo]
Link: https://lkml.kernel.org/r/20250529155650.4017699-5-pulehui@huaweicloud.com
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/merge.c | 43 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index c76646cdf6e6..bbae66fc5038 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -2,11 +2,14 @@
 
 #define _GNU_SOURCE
 #include "../kselftest_harness.h"
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/mman.h>
+#include <sys/syscall.h>
 #include <sys/wait.h>
+#include <linux/perf_event.h>
 #include "vm_util.h"
 
 FIXTURE(merge)
@@ -452,4 +455,44 @@ TEST_F(merge, forked_source_vma)
 	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
 }
 
+TEST_F(merge, handle_uprobe_upon_merged_vma)
+{
+	const size_t attr_sz = sizeof(struct perf_event_attr);
+	unsigned int page_size = self->page_size;
+	const char *probe_file = "./foo";
+	char *carveout = self->carveout;
+	struct perf_event_attr attr;
+	unsigned long type;
+	void *ptr1, *ptr2;
+	int fd;
+
+	fd = open(probe_file, O_RDWR|O_CREAT, 0600);
+	ASSERT_GE(fd, 0);
+
+	ASSERT_EQ(ftruncate(fd, page_size), 0);
+	ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0);
+
+	memset(&attr, 0, attr_sz);
+	attr.size = attr_sz;
+	attr.type = type;
+	attr.config1 = (__u64)(long)probe_file;
+	attr.config2 = 0x0;
+
+	ASSERT_GE(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0), 0);
+
+	ptr1 = mmap(&carveout[page_size], 10 * page_size, PROT_EXEC,
+		    MAP_PRIVATE | MAP_FIXED, fd, 0);
+	ASSERT_NE(ptr1, MAP_FAILED);
+
+	ptr2 = mremap(ptr1, page_size, 2 * page_size,
+		      MREMAP_MAYMOVE | MREMAP_FIXED, ptr1 + 5 * page_size);
+	ASSERT_NE(ptr2, MAP_FAILED);
+
+	ASSERT_NE(mremap(ptr2, page_size, page_size,
+			 MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
+
+	close(fd);
+	remove(probe_file);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: mm/hugetlb: unshare page tables during VMA split, not before

Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split().  This happens before the VMA lock and rmap locks are
taken - which is too early, it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens.  At that
point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
   call us with only the mmap lock held (in shared mode), but currently
   only runs while holding mmap lock (exclusively) and VMA lock

Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.

[jannh@google.com: v2]
  Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/vma_internal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 441feb21aa5a..4505b1c31be1 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	(void)next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
 	mas_destroy(&vmi->mas);
-- 
cgit v1.2.3


From 7054674ee9b9c0c62c2a254243f876f577d36db2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Jun 2025 14:50:54 +0200
Subject: selftests/mount_setattr: adapt detached mount propagation test

Make sure that detached trees don't receive mount propagation.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 .../selftests/mount_setattr/mount_setattr_test.c        | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 8b378c91debf..b1e4618399be 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -2079,24 +2079,9 @@ TEST_F(mount_setattr, detached_tree_propagation)
 	 * means that the device information will be different for any
 	 * statx() that was taken from /mnt/A before the mount compared
 	 * to one after the mount.
-	 *
-	 * Since we already now that the device information between the
-	 * stx1 and stx2 samples are identical we also now that stx2 and
-	 * stx3 device information will necessarily differ.
 	 */
 	ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor);
-
-	/*
-	 * If mount propagation worked correctly then the tmpfs mount
-	 * that was created after the mount namespace was unshared will
-	 * have propagated onto /mnt/A in the detached mount tree.
-	 *
-	 * Verify that the device information for stx3 and stx4 are
-	 * identical. It is already established that stx3 is different
-	 * from both stx1 and stx2 sampled before the tmpfs mount was
-	 * done so if stx3 and stx4 are identical the proof is done.
-	 */
-	ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor);
+	ASSERT_EQ(stx1.stx_dev_minor, stx4.stx_dev_minor);
 
 	EXPECT_EQ(close(fd_tree), 0);
 }
-- 
cgit v1.2.3


From f287822688eeb44ae1cf6ac45701d965efc33218 Mon Sep 17 00:00:00 2001
From: "Xin Li (Intel)" <xin@zytor.com>
Date: Mon, 9 Jun 2025 01:40:54 -0700
Subject: selftests/x86: Add a test to detect infinite SIGTRAP handler loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When FRED is enabled, if the Trap Flag (TF) is set without an external
debugger attached, it can lead to an infinite loop in the SIGTRAP
handler.  To avoid this, the software event flag in the augmented SS
must be cleared, ensuring that no single-step trap remains pending when
ERETU completes.

This test checks for that specific scenario—verifying whether the kernel
correctly prevents an infinite SIGTRAP loop in this edge case when FRED
is enabled.

The test should _always_ pass with IDT event delivery, thus no need to
disable the test even when FRED is not enabled.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Sohil Mehta <sohil.mehta@intel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250609084054.2083189-3-xin%40zytor.com
---
 tools/testing/selftests/x86/Makefile       |   2 +-
 tools/testing/selftests/x86/sigtrap_loop.c | 101 +++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigtrap_loop.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index f703fcfe9f7c..83148875a12c 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl ioperm \
-			test_vsyscall mov_ss_trap \
+			test_vsyscall mov_ss_trap sigtrap_loop \
 			syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_BOTHBITS += nx_stack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
diff --git a/tools/testing/selftests/x86/sigtrap_loop.c b/tools/testing/selftests/x86/sigtrap_loop.c
new file mode 100644
index 000000000000..9d065479e89f
--- /dev/null
+++ b/tools/testing/selftests/x86/sigtrap_loop.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Intel Corporation
+ */
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ucontext.h>
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+
+	return;
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+	static unsigned int loop_count_on_same_ip;
+	static unsigned long last_trap_ip;
+
+	if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) {
+		printf("\tTrapped at %016lx\n", last_trap_ip);
+
+		/*
+		 * If the same IP is hit more than 10 times in a row, it is
+		 * _considered_ an infinite loop.
+		 */
+		if (++loop_count_on_same_ip > 10) {
+			printf("[FAIL]\tDetected SIGTRAP infinite loop\n");
+			exit(1);
+		}
+
+		return;
+	}
+
+	loop_count_on_same_ip = 0;
+	last_trap_ip = ctx->uc_mcontext.gregs[REG_IP];
+	printf("\tTrapped at %016lx\n", last_trap_ip);
+}
+
+int main(int argc, char *argv[])
+{
+	sethandler(SIGTRAP, sigtrap, 0);
+
+	/*
+	 * Set the Trap Flag (TF) to single-step the test code, therefore to
+	 * trigger a SIGTRAP signal after each instruction until the TF is
+	 * cleared.
+	 *
+	 * Because the arithmetic flags are not significant here, the TF is
+	 * set by pushing 0x302 onto the stack and then popping it into the
+	 * flags register.
+	 *
+	 * Four instructions in the following asm code are executed with the
+	 * TF set, thus the SIGTRAP handler is expected to run four times.
+	 */
+	printf("[RUN]\tSIGTRAP infinite loop detection\n");
+	asm volatile(
+#ifdef __x86_64__
+		/*
+		 * Avoid clobbering the redzone
+		 *
+		 * Equivalent to "sub $128, %rsp", however -128 can be encoded
+		 * in a single byte immediate while 128 uses 4 bytes.
+		 */
+		"add $-128, %rsp\n\t"
+#endif
+		"push $0x302\n\t"
+		"popf\n\t"
+		"nop\n\t"
+		"nop\n\t"
+		"push $0x202\n\t"
+		"popf\n\t"
+#ifdef __x86_64__
+		"sub $-128, %rsp\n\t"
+#endif
+	);
+
+	printf("[OK]\tNo SIGTRAP infinite loop detected\n");
+	return 0;
+}
-- 
cgit v1.2.3


From 567766954b2d5d6c29f568ad4382d6351ea48079 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 17:12:45 -0700
Subject: selftests: net: add test case for NAT46 looping back dst

Simple test for crash involving multicast loopback and stale dst.
Reuse exising NAT46 program.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250610001245.1981782-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile   |  1 +
 tools/testing/selftests/net/nat6to4.sh | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100755 tools/testing/selftests/net/nat6to4.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ea84b88bcb30..ab996bd22a5f 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -27,6 +27,7 @@ TEST_PROGS += amt.sh
 TEST_PROGS += unicast_extensions.sh
 TEST_PROGS += udpgro_fwd.sh
 TEST_PROGS += udpgro_frglist.sh
+TEST_PROGS += nat6to4.sh
 TEST_PROGS += veth.sh
 TEST_PROGS += ioam6.sh
 TEST_PROGS += gro.sh
diff --git a/tools/testing/selftests/net/nat6to4.sh b/tools/testing/selftests/net/nat6to4.sh
new file mode 100755
index 000000000000..0ee859b622a4
--- /dev/null
+++ b/tools/testing/selftests/net/nat6to4.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NS="ns-peer-$(mktemp -u XXXXXX)"
+
+ip netns add "${NS}"
+ip -netns "${NS}" link set lo up
+ip -netns "${NS}" route add default via 127.0.0.2 dev lo
+
+tc -n "${NS}" qdisc add dev lo ingress
+tc -n "${NS}" filter add dev lo ingress prio 4 protocol ip \
+   bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action
+
+ip netns exec "${NS}" \
+   bash -c 'echo 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789abc | socat - UDP4-DATAGRAM:224.1.0.1:6666,ip-multicast-loop=1'
-- 
cgit v1.2.3


From b1a529bdb9641392b4f5fc91545598793c0b3b96 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 5 Jun 2025 22:34:31 +0100
Subject: selftests/mm: skip failed memfd setups in gup_longterm

Unlike the other cases gup_longterm's memfd tests previously skipped the
test when failing to set up the file descriptor to test.  Restore this
behavior to avoid hitting failures when hugetlb isn't configured.

Link: https://lkml.kernel.org/r/20250605-selftest-mm-gup-longterm-tweaks-v1-1-2fae34b05958@kernel.org
Fixes: 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm")
Signed-off-by: Mark Brown <broonie@kernel.org>
Reported-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Closes: https://lkml.kernel.org/r/a76fc252-0fe3-4d4b-a9a1-4a2895c2680d@lucifer.local
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/gup_longterm.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c
index 8a97ac5176a4..29047d2e0c49 100644
--- a/tools/testing/selftests/mm/gup_longterm.c
+++ b/tools/testing/selftests/mm/gup_longterm.c
@@ -298,8 +298,11 @@ static void run_with_memfd(test_fn fn, const char *desc)
 	log_test_start("%s ... with memfd", desc);
 
 	fd = memfd_create("test", 0);
-	if (fd < 0)
+	if (fd < 0) {
 		ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+		log_test_result(KSFT_SKIP);
+		return;
+	}
 
 	fn(fd, pagesize);
 	close(fd);
@@ -366,6 +369,8 @@ static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
 	fd = memfd_create("test", flags);
 	if (fd < 0) {
 		ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+		log_test_result(KSFT_SKIP);
+		return;
 	}
 
 	fn(fd, hugetlbsize);
-- 
cgit v1.2.3


From 56c5d291e88538621029e5c7c5f60540d37846a8 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Thu, 12 Jun 2025 10:19:58 +0300
Subject: selftests: drv-net: rss_ctx: Add test for ntuple rules targeting
 default RSS context

Add test_rss_default_context_rule() to verify that ntuple rules can
correctly direct traffic to the default RSS context (context 0).

The test creates two ntuple rules with explicit location priorities:
- A high-priority rule (loc 0) directing specific port traffic to
  context 0.
- A low-priority rule (loc 1) directing all other TCP traffic to context
  1.

This validates that:
1. Rules targeting the default context function properly.
2. Traffic steering works as expected when mixing default and
   additional RSS contexts.

The test was written by AI, and reviewed by humans.

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250612071958.1696361-3-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_ctx.py | 59 ++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index ca60ae325c22..7bb552f8b182 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -747,6 +747,62 @@ def test_rss_ntuple_addition(cfg):
                                                           'noise' : (0,) })
 
 
+def test_rss_default_context_rule(cfg):
+    """
+    Allocate a port, direct this port to context 0, then create a new RSS
+    context and steer all TCP traffic to it (context 1).  Verify that:
+      * Traffic to the specific port continues to use queues of the main
+        context (0/1).
+      * Traffic to any other TCP port is redirected to the new context
+        (queues 2/3).
+    """
+
+    require_ntuple(cfg)
+
+    queue_cnt = len(_get_rx_cnts(cfg))
+    if queue_cnt < 4:
+        try:
+            ksft_pr(f"Increasing queue count {queue_cnt} -> 4")
+            ethtool(f"-L {cfg.ifname} combined 4")
+            defer(ethtool, f"-L {cfg.ifname} combined {queue_cnt}")
+        except Exception as exc:
+            raise KsftSkipEx("Not enough queues for the test") from exc
+
+    # Use queues 0 and 1 for the main context
+    ethtool(f"-X {cfg.ifname} equal 2")
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    # Create a new RSS context that uses queues 2 and 3
+    ctx_id = ethtool_create(cfg, "-X", "context new start 2 equal 2")
+    defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete")
+
+    # Generic low-priority rule: redirect all TCP traffic to the new context.
+    # Give it an explicit higher location number (lower priority).
+    flow_generic = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} context {ctx_id} loc 1"
+    ethtool(f"-N {cfg.ifname} {flow_generic}")
+    defer(ethtool, f"-N {cfg.ifname} delete 1")
+
+    # Specific high-priority rule for a random port that should stay on context 0.
+    # Assign loc 0 so it is evaluated before the generic rule.
+    port_main = rand_port()
+    flow_main = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port_main} context 0 loc 0"
+    ethtool(f"-N {cfg.ifname} {flow_main}")
+    defer(ethtool, f"-N {cfg.ifname} delete 0")
+
+    _ntuple_rule_check(cfg, 1, ctx_id)
+
+    # Verify that traffic matching the specific rule still goes to queues 0/1
+    _send_traffic_check(cfg, port_main, "context 0",
+                        { 'target': (0, 1),
+                          'empty' : (2, 3) })
+
+    # And that traffic for any other port is steered to the new context
+    port_other = rand_port()
+    _send_traffic_check(cfg, port_other, f"context {ctx_id}",
+                        { 'target': (2, 3),
+                          'noise' : (0, 1) })
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
         cfg.context_cnt = None
@@ -760,7 +816,8 @@ def main() -> None:
                   test_rss_context_overlap, test_rss_context_overlap2,
                   test_rss_context_out_of_order, test_rss_context4_create_with_cfg,
                   test_flow_add_context_missing,
-                  test_delete_rss_context_busy, test_rss_ntuple_addition],
+                  test_delete_rss_context_busy, test_rss_ntuple_addition,
+                  test_rss_default_context_rule],
                  args=(cfg, ))
     ksft_exit()
 
-- 
cgit v1.2.3


From bb666b7c27073b986b75699e51a7102910f58060 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 9 Jun 2025 17:57:49 +0100
Subject: mm: add mmap_prepare() compatibility layer for nested file systems

Nested file systems, that is those which invoke call_mmap() within their
own f_op->mmap() handlers, may encounter underlying file systems which
provide the f_op->mmap_prepare() hook introduced by commit c84bf6dd2b83
("mm: introduce new .mmap_prepare() file callback").

We have a chicken-and-egg scenario here - until all file systems are
converted to using .mmap_prepare(), we cannot convert these nested
handlers, as we can't call f_op->mmap from an .mmap_prepare() hook.

So we have to do it the other way round - invoke the .mmap_prepare() hook
from an .mmap() one.

in order to do so, we need to convert VMA state into a struct vm_area_desc
descriptor, invoking the underlying file system's f_op->mmap_prepare()
callback passing a pointer to this, and then setting VMA state accordingly
and safely.

This patch achieves this via the compat_vma_mmap_prepare() function, which
we invoke from call_mmap() if f_op->mmap_prepare() is specified in the
passed in file pointer.

We place the fundamental logic into mm/vma.h where VMA manipulation
belongs.  We also update the VMA userland tests to accommodate the
changes.

The compat_vma_mmap_prepare() function and its associated machinery is
temporary, and will be removed once the conversion of file systems is
complete.

We carefully place this code so it can be used with CONFIG_MMU and also
with cutting edge nommu silicon.

[akpm@linux-foundation.org: export compat_vma_mmap_prepare tp fix build]
[lorenzo.stoakes@oracle.com: remove unused declarations]
  Link: https://lkml.kernel.org/r/ac3ae324-4c65-432a-8c6d-2af988b18ac8@lucifer.local
Link: https://lkml.kernel.org/r/20250609165749.344976-1-lorenzo.stoakes@oracle.com
Fixes: c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file callback").
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/linux-mm/CAG48ez04yOEVx1ekzOChARDDBZzAKwet8PEoPM4Ln3_rk91AzQ@mail.gmail.com/
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/vma_internal.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 4505b1c31be1..14718ca23a05 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -159,6 +159,14 @@ typedef __bitwise unsigned int vm_fault_t;
 
 #define ASSERT_EXCLUSIVE_WRITER(x)
 
+/**
+ * swap - swap values of @a and @b
+ * @a: first value
+ * @b: second value
+ */
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
 struct kref {
 	refcount_t refcount;
 };
@@ -1468,4 +1476,12 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 	(void)vma;
 }
 
+static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+	/* Changing an anonymous vma with this is illegal */
+	get_file(file);
+	swap(vma->vm_file, file);
+	fput(file);
+}
+
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From a766cfbbeb3a74397965a8fa2e9a402026d3e1d8 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Thu, 12 Jun 2025 22:28:56 -0700
Subject: bpf: Mark dentry->d_inode as trusted_or_null

LSM hooks such as security_path_mknod() and security_inode_rename() have
access to newly allocated negative dentry, which has NULL d_inode.
Therefore, it is necessary to do the NULL pointer check for d_inode.

Also add selftests that checks the verifier enforces the NULL pointer
check.

Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20250613052857.1992233-1-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/progs/verifier_vfs_accept.c  | 18 ++++++++++++++++++
 .../testing/selftests/bpf/progs/verifier_vfs_reject.c  | 15 +++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
index a7c0a553aa50..3e2d76ee8050 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2024 Google LLC. */
 
 #include <vmlinux.h>
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
@@ -82,4 +83,21 @@ int BPF_PROG(path_d_path_from_file_argument, struct file *file)
 	return 0;
 }
 
+SEC("lsm.s/inode_rename")
+__success
+int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
+	     struct inode *new_dir, struct dentry *new_dentry,
+	     unsigned int flags)
+{
+	struct inode *inode = new_dentry->d_inode;
+	ino_t ino;
+
+	if (!inode)
+		return 0;
+	ino = inode->i_ino;
+	if (ino == 0)
+		return -EACCES;
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
index d6d3f4fcb24c..4b392c6c8fc4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2024 Google LLC. */
 
 #include <vmlinux.h>
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <linux/limits.h>
@@ -158,4 +159,18 @@ int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f)
 	return 0;
 }
 
+SEC("lsm.s/inode_rename")
+__failure __msg("invalid mem access 'trusted_ptr_or_null_'")
+int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
+	     struct inode *new_dir, struct dentry *new_dentry,
+	     unsigned int flags)
+{
+	struct inode *inode = new_dentry->d_inode;
+	ino_t ino;
+
+	ino = inode->i_ino;
+	if (ino == 0)
+		return -EACCES;
+	return 0;
+}
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 3168276591210d147ed9e6dc267f8a04007593c7 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Tue, 17 Jun 2025 14:20:59 -0700
Subject: selftests: netdevsim: improve lib.sh include in peer.sh

Fix the peer.sh test to run from INSTALL_PATH.

Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20250617212102.175711-2-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/netdevsim/peer.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/peer.sh b/tools/testing/selftests/drivers/net/netdevsim/peer.sh
index 1bb46ec435d4..7f32b5600925 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/peer.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/peer.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0-only
 
-source ../../../net/lib.sh
+lib_dir=$(dirname $0)/../../../net
+source $lib_dir/lib.sh
 
 NSIM_DEV_1_ID=$((256 + RANDOM % 256))
 NSIM_DEV_1_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_DEV_1_ID
-- 
cgit v1.2.3


From c65b5bb2329e36340eba76f05752f8f1eb15bee0 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Tue, 17 Jun 2025 14:21:00 -0700
Subject: selftests: net: add passive TFO test binary

Add a simple passive TFO server and client test binary. This will be
used to test the SO_INCOMING_NAPI_ID of passive TFO accepted sockets.

Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20250617212102.175711-3-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore |   1 +
 tools/testing/selftests/net/Makefile   |   1 +
 tools/testing/selftests/net/tfo.c      | 171 +++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 tools/testing/selftests/net/tfo.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 532bb732bc6d..c6dd2a335cf4 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -50,6 +50,7 @@ tap
 tcp_fastopen_backup_key
 tcp_inq
 tcp_mmap
+tfo
 timestamping
 tls
 toeplitz
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ab996bd22a5f..ab8da438fd78 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -110,6 +110,7 @@ TEST_GEN_PROGS += proc_net_pktgen
 TEST_PROGS += lwt_dst_cache_ref_loop.sh
 TEST_PROGS += skf_net_off.sh
 TEST_GEN_FILES += skf_net_off
+TEST_GEN_FILES += tfo
 
 # YNL files, must be before "include ..lib.mk"
 YNL_GEN_FILES := busy_poller netlink-dumps
diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c
new file mode 100644
index 000000000000..eb3cac5e583c
--- /dev/null
+++ b/tools/testing/selftests/net/tfo.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <errno.h>
+
+static int cfg_server;
+static int cfg_client;
+static int cfg_port = 8000;
+static struct sockaddr_in6 cfg_addr;
+static char *cfg_outfile;
+
+static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
+{
+	int ret;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = htons(port);
+
+	ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr);
+	if (ret != 1) {
+		/* fallback to plain IPv4 */
+		ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]);
+		if (ret != 1)
+			return -1;
+
+		/* add ::ffff prefix */
+		sin6->sin6_addr.s6_addr32[0] = 0;
+		sin6->sin6_addr.s6_addr32[1] = 0;
+		sin6->sin6_addr.s6_addr16[4] = 0;
+		sin6->sin6_addr.s6_addr16[5] = 0xffff;
+	}
+
+	return 0;
+}
+
+static void run_server(void)
+{
+	unsigned long qlen = 32;
+	int fd, opt, connfd;
+	socklen_t len;
+	char buf[64];
+	FILE *outfile;
+
+	outfile = fopen(cfg_outfile, "w");
+	if (!outfile)
+		error(1, errno, "fopen() outfile");
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket()");
+
+	opt = 1;
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+		error(1, errno, "setsockopt(SO_REUSEADDR)");
+
+	if (setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
+		error(1, errno, "setsockopt(TCP_FASTOPEN)");
+
+	if (bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)) < 0)
+		error(1, errno, "bind()");
+
+	if (listen(fd, 5) < 0)
+		error(1, errno, "listen()");
+
+	len = sizeof(cfg_addr);
+	connfd = accept(fd, (struct sockaddr *)&cfg_addr, &len);
+	if (connfd < 0)
+		error(1, errno, "accept()");
+
+	len = sizeof(opt);
+	if (getsockopt(connfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &opt, &len) < 0)
+		error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)");
+
+	read(connfd, buf, 64);
+	fprintf(outfile, "%d\n", opt);
+
+	fclose(outfile);
+	close(connfd);
+	close(fd);
+}
+
+static void run_client(void)
+{
+	int fd;
+	char *msg = "Hello, world!";
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket()");
+
+	sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr));
+
+	close(fd);
+}
+
+static void usage(const char *filepath)
+{
+	error(1, 0, "Usage: %s (-s|-c) -h<server_ip> -p<port> -o<outfile> ", filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	struct sockaddr_in6 *addr6 = (void *) &cfg_addr;
+	char *addr = NULL;
+	int ret;
+	int c;
+
+	if (argc <= 1)
+		usage(argv[0]);
+
+	while ((c = getopt(argc, argv, "sch:p:o:")) != -1) {
+		switch (c) {
+		case 's':
+			if (cfg_client)
+				error(1, 0, "Pass one of -s or -c");
+			cfg_server = 1;
+			break;
+		case 'c':
+			if (cfg_server)
+				error(1, 0, "Pass one of -s or -c");
+			cfg_client = 1;
+			break;
+		case 'h':
+			addr = optarg;
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'o':
+			cfg_outfile = strdup(optarg);
+			if (!cfg_outfile)
+				error(1, 0, "outfile invalid");
+			break;
+		}
+	}
+
+	if (cfg_server && addr)
+		error(1, 0, "Server cannot have -h specified");
+
+	memset(addr6, 0, sizeof(*addr6));
+	addr6->sin6_family = AF_INET6;
+	addr6->sin6_port = htons(cfg_port);
+	addr6->sin6_addr = in6addr_any;
+	if (addr) {
+		ret = parse_address(addr, cfg_port, addr6);
+		if (ret)
+			error(1, 0, "Client address parse error: %s", addr);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (cfg_server)
+		run_server();
+	else if (cfg_client)
+		run_client();
+
+	return 0;
+}
-- 
cgit v1.2.3


From 137e7b5cceda2fd634ff47fde6c4226092d09442 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Tue, 17 Jun 2025 14:21:01 -0700
Subject: selftests: net: add test for passive TFO socket NAPI ID

Add a test that checks that the NAPI ID of a passive TFO socket is valid
i.e. not zero.

Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20250617212102.175711-4-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/tfo_passive.sh | 112 +++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100755 tools/testing/selftests/net/tfo_passive.sh

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ab8da438fd78..332f387615d7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -111,6 +111,7 @@ TEST_PROGS += lwt_dst_cache_ref_loop.sh
 TEST_PROGS += skf_net_off.sh
 TEST_GEN_FILES += skf_net_off
 TEST_GEN_FILES += tfo
+TEST_PROGS += tfo_passive.sh
 
 # YNL files, must be before "include ..lib.mk"
 YNL_GEN_FILES := busy_poller netlink-dumps
diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh
new file mode 100755
index 000000000000..80bf11fdc046
--- /dev/null
+++ b/tools/testing/selftests/net/tfo_passive.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+source lib.sh
+
+NSIM_SV_ID=$((256 + RANDOM % 256))
+NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID
+NSIM_CL_ID=$((512 + RANDOM % 256))
+NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
+NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device
+
+SERVER_IP=192.168.1.1
+CLIENT_IP=192.168.1.2
+SERVER_PORT=48675
+
+setup_ns()
+{
+	set -e
+	ip netns add nssv
+	ip netns add nscl
+
+	NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_SV_SYS/net -exec basename {} \;)
+	NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_CL_SYS/net -exec basename {} \;)
+
+	ip link set $NSIM_SV_NAME netns nssv
+	ip link set $NSIM_CL_NAME netns nscl
+
+	ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME
+	ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME
+
+	ip netns exec nssv ip link set dev $NSIM_SV_NAME up
+	ip netns exec nscl ip link set dev $NSIM_CL_NAME up
+
+	# Enable passive TFO
+	ip netns exec nssv sysctl -w net.ipv4.tcp_fastopen=519 > /dev/null
+
+	set +e
+}
+
+cleanup_ns()
+{
+	ip netns del nscl
+	ip netns del nssv
+}
+
+###
+### Code start
+###
+
+modprobe netdevsim
+
+# linking
+
+echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+
+setup_ns
+
+NSIM_SV_FD=$((256 + RANDOM % 256))
+exec {NSIM_SV_FD}</var/run/netns/nssv
+NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)
+
+NSIM_CL_FD=$((256 + RANDOM % 256))
+exec {NSIM_CL_FD}</var/run/netns/nscl
+NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
+     $NSIM_DEV_SYS_LINK
+
+if [ $? -ne 0 ]; then
+	echo "linking netdevsim1 with netdevsim2 should succeed"
+	cleanup_ns
+	exit 1
+fi
+
+out_file=$(mktemp)
+
+timeout -k 1s 30s ip netns exec nssv ./tfo        \
+				-s                \
+				-p ${SERVER_PORT} \
+				-o ${out_file}&
+
+wait_local_port_listen nssv ${SERVER_PORT} tcp
+
+ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT}
+
+wait
+
+res=$(cat $out_file)
+rm $out_file
+
+if [ $res -eq 0 ]; then
+	echo "got invalid NAPI ID from passive TFO socket"
+	cleanup_ns
+	exit 1
+fi
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
+
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
+
+cleanup_ns
+
+modprobe -r netdevsim
+
+exit 0
-- 
cgit v1.2.3


From d4adf1c9ee7722545450608bcb095fb31512f0c6 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 18 Jun 2025 17:57:40 -0400
Subject: bpf: Adjust free target to avoid global starvation of LRU map

BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the
map is full, due to percpu reservations and force shrink before
neighbor stealing. Once a CPU is unable to borrow from the global map,
it will once steal one elem from a neighbor and after that each time
flush this one element to the global list and immediately recycle it.

Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map
with 79 CPUs. CPU 79 will observe this behavior even while its
neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%).

CPUs need not be active concurrently. The issue can appear with
affinity migration, e.g., irqbalance. Each CPU can reserve and then
hold onto its 128 elements indefinitely.

Avoid global list exhaustion by limiting aggregate percpu caches to
half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count.
This change has no effect on sufficiently large tables.

Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable
lru->free_target. The extra field fits in a hole in struct bpf_lru.
The cacheline is already warm where read in the hot path. The field is
only accessed with the lru lock held.

Tested-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_lru_map.c | 72 +++++++++++++++---------------
 1 file changed, 35 insertions(+), 37 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index fda7589c5023..4ae83f4b7fc7 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try)
 	return ret;
 }
 
+/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */
+static unsigned int __map_size(unsigned int tgt_free)
+{
+	return tgt_free * nr_cpus * 2;
+}
+
 /* Size of the LRU map is 2
  * Add key=1 (+1 key)
  * Add key=2 (+1 key)
@@ -231,11 +237,11 @@ static void test_lru_sanity0(int map_type, int map_flags)
 	printf("Pass\n");
 }
 
-/* Size of the LRU map is 1.5*tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Lookup 1 to tgt_free/2
- * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
- * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU
+/* Verify that unreferenced elements are recycled before referenced ones.
+ * Insert elements.
+ * Reference a subset of these.
+ * Insert more, enough to trigger recycling.
+ * Verify that unreferenced are recycled.
  */
 static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 {
@@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -266,13 +272,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	/* Insert map_size - batch_size keys */
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
-	/* Lookup 1 to tgt_free/2 */
+	/* Lookup 1 to batch_size */
 	end_key = 1 + batch_size;
 	for (key = 1; key < end_key; key++) {
 		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
@@ -280,12 +286,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 					    BPF_NOEXIST));
 	}
 
-	/* Insert 1+tgt_free to 2*tgt_free
-	 * => 1+tgt_free/2 to LOCALFREE_TARGET will be
+	/* Insert another map_size - batch_size keys
+	 * Map will contain 1 to batch_size plus these latest, i.e.,
+	 * => previous 1+batch_size to map_size - batch_size will have been
 	 * removed by LRU
 	 */
-	key = 1 + tgt_free;
-	end_key = key + tgt_free;
+	key = 1 + __map_size(tgt_free);
+	end_key = key + __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -301,17 +308,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	printf("Pass\n");
 }
 
-/* Size of the LRU map 1.5 * tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Update 1 to tgt_free/2
- *   => The original 1 to tgt_free/2 will be removed due to
- *      the LRU shrink process
- * Re-insert 1 to tgt_free/2 again and do a lookup immeidately
- * Insert 1+tgt_free to tgt_free*3/2
- * Insert 1+tgt_free*3/2 to tgt_free*5/2
- *   => Key 1+tgt_free to tgt_free*3/2
- *      will be removed from LRU because it has never
- *      been lookup and ref bit is not set
+/* Verify that insertions exceeding map size will recycle the oldest.
+ * Verify that unreferenced elements are recycled before referenced.
  */
 static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 {
@@ -334,7 +332,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -343,8 +341,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	/* Insert map_size - batch_size keys */
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -357,8 +355,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	 * shrink the inactive list to get tgt_free
 	 * number of free nodes.
 	 *
-	 * Hence, the oldest key 1 to tgt_free/2
-	 * are removed from the LRU list.
+	 * Hence, the oldest key is removed from the LRU list.
 	 */
 	key = 1;
 	if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
@@ -370,8 +367,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 					   BPF_EXIST));
 	}
 
-	/* Re-insert 1 to tgt_free/2 again and do a lookup
-	 * immeidately.
+	/* Re-insert 1 to batch_size again and do a lookup immediately.
 	 */
 	end_key = 1 + batch_size;
 	value[0] = 4321;
@@ -387,17 +383,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1+tgt_free to tgt_free*3/2 */
-	end_key = 1 + tgt_free + batch_size;
-	for (key = 1 + tgt_free; key < end_key; key++)
+	/* Insert batch_size new elements */
+	key = 1 + __map_size(tgt_free);
+	end_key = key + batch_size;
+	for (; key < end_key; key++)
 		/* These newly added but not referenced keys will be
 		 * gone during the next LRU shrink.
 		 */
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
-	/* Insert 1+tgt_free*3/2 to  tgt_free*5/2 */
-	end_key = key + tgt_free;
+	/* Insert map_size - batch_size elements */
+	end_key += __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -500,7 +497,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
 		lru_map_fd = create_map(map_type, map_flags,
 					3 * tgt_free * nr_cpus);
 	else
-		lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+		lru_map_fd = create_map(map_type, map_flags,
+					3 * __map_size(tgt_free));
 	assert(lru_map_fd != -1);
 
 	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
-- 
cgit v1.2.3


From 56a14984505b11674df5e01407748236bc4bc8f8 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Sun, 8 Jun 2025 17:54:02 +0800
Subject: KVM: arm64: selftests: Close the GIC FD in arch_timer_edge_cases

Close the GIC FD to free the reference it holds to the VM so that we can
correctly clean up the VM. This also gets rid of the

	"KVM: debugfs: duplicate directory 395722-4"

warning when running arch_timer_edge_cases.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Reviewed-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250608095402.1131-1-yuzenghui@huawei.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 .../testing/selftests/kvm/arm64/arch_timer_edge_cases.c  | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index b4d22b3ab7cc..4e71740a098b 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -954,6 +954,8 @@ static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 	pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq);
 }
 
+static int gic_fd;
+
 static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
 			   enum arch_timer timer)
 {
@@ -968,12 +970,20 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
 	vcpu_args_set(*vcpu, 1, timer);
 
 	test_init_timer_irq(*vm, *vcpu);
-	vgic_v3_setup(*vm, 1, 64);
+	gic_fd = vgic_v3_setup(*vm, 1, 64);
+	__TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3");
+
 	sync_global_to_guest(*vm, test_args);
 	sync_global_to_guest(*vm, CVAL_MAX);
 	sync_global_to_guest(*vm, DEF_CNT);
 }
 
+static void test_vm_cleanup(struct kvm_vm *vm)
+{
+	close(gic_fd);
+	kvm_vm_free(vm);
+}
+
 static void test_print_help(char *name)
 {
 	pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n"
@@ -1060,13 +1070,13 @@ int main(int argc, char *argv[])
 	if (test_args.test_virtual) {
 		test_vm_create(&vm, &vcpu, VIRTUAL);
 		test_run(vm, vcpu);
-		kvm_vm_free(vm);
+		test_vm_cleanup(vm);
 	}
 
 	if (test_args.test_physical) {
 		test_vm_create(&vm, &vcpu, PHYSICAL);
 		test_run(vm, vcpu);
-		kvm_vm_free(vm);
+		test_vm_cleanup(vm);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 965f87700adbcc6d72430f524d88135027f5bba3 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Mon, 9 Jun 2025 12:06:07 +0000
Subject: selftests/mm: increase timeout from 180 to 900 seconds

The mm selftests are timing out with the current 180-second limit.
Testing shows that run_vmtests.sh takes approximately 11 minutes
(664 seconds) to complete.

Increase the timeout to 900 seconds (15 minutes) to provide sufficient
buffer for the tests to complete successfully.

Link: https://lkml.kernel.org/r/20250609120606.73145-2-shivankg@amd.com
Signed-off-by: Shivank Garg <shivankg@amd.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/settings | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
index a953c96aa16e..e2206265f67c 100644
--- a/tools/testing/selftests/mm/settings
+++ b/tools/testing/selftests/mm/settings
@@ -1 +1 @@
-timeout=180
+timeout=900
-- 
cgit v1.2.3


From 223731cd63004cb07edfc6257d53565995c00cbb Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Fri, 13 Jun 2025 09:19:12 +0530
Subject: selftests/mm: add configs to fix testcase failure

If CONFIG_UPROBES is not set, a merge subtest fails:

Failure log:

  7151 12:46:54.627936  # # #  RUN           merge.handle_uprobe_upon_merged_vma ...
  7152 12:46:54.639014  # # f /sys/bus/event_source/devices/uprobe/type
  7153 12:46:54.639306  # # fopen: No such file or directory
  7154 12:46:54.650451  # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0)
  7155 12:46:54.650730  # # # handle_uprobe_upon_merged_vma: Test terminated by assertion
  7156 12:46:54.661750  # # #          FAIL  merge.handle_uprobe_upon_merged_vma
  7157 12:46:54.662030  # # not ok 8 merge.handle_uprobe_upon_merged_vma

CONFIG_UPROBES is enabled by CONFIG_UPROBE_EVENTS, which gets enabled by
CONFIG_FTRACE. Therefore add these configs to selftests/mm/config so that
CI systems can include this config in the kernel build. To be completely
safe, add CONFIG_PROFILING too, to enable the dependency chain
PROFILING -> PERF_EVENTS -> UPROBE_EVENTS -> UPROBES.

Link: https://lkml.kernel.org/r/20250613034912.53791-1-dev.jain@arm.com
Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge")
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reported-by: Aishwarya <aishwarya.tcv@arm.com>
Closes: https://lore.kernel.org/all/20250610103729.72440-1-aishwarya.tcv@arm.com/
Tested-by: Aishwarya TCV <aishwarya.tcv@arm.com>
Tested-by : Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Pu Lehui <pulehui@huawei.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/config | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
index a28baa536332..deba93379c80 100644
--- a/tools/testing/selftests/mm/config
+++ b/tools/testing/selftests/mm/config
@@ -8,3 +8,6 @@ CONFIG_GUP_TEST=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_MEM_SOFT_DIRTY=y
 CONFIG_ANON_VMA_NAME=y
+CONFIG_FTRACE=y
+CONFIG_PROFILING=y
+CONFIG_UPROBES=y
-- 
cgit v1.2.3


From 3333871296efd52ef6f6d5cce5a92dc7b06ba879 Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pfalcato@suse.de>
Date: Tue, 10 Jun 2025 13:22:09 +0100
Subject: selftests/mm: skip uprobe vma merge test if uprobes are not enabled

If uprobes are not enabled, the test currently fails with:

7151 12:46:54.627936  # # #  RUN           merge.handle_uprobe_upon_merged_vma ...
7152 12:46:54.639014  # # f /sys/bus/event_source/devices/uprobe/type
7153 12:46:54.639306  # # fopen: No such file or directory
7154 12:46:54.650451  # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected
read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0)
7155 12:46:54.650730  # # # handle_uprobe_upon_merged_vma: Test terminated by assertion
7156 12:46:54.661750  # # #          FAIL  merge.handle_uprobe_upon_merged_vma
7157 12:46:54.662030  # # not ok 8 merge.handle_uprobe_upon_merged_vma

Skipping is a more sane and friendly behavior here.

Link: https://lkml.kernel.org/r/20250610122209.3177587-1-pfalcato@suse.de
Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge")
Signed-off-by: Pedro Falcato <pfalcato@suse.de>
Reported-by: Aishwarya <aishwarya.tcv@arm.com>
Closes: https://lore.kernel.org/linux-mm/20250610103729.72440-1-aishwarya.tcv@arm.com/
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by : Donet Tom <donettom@linux.ibm.com>
Reviewed-by : Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Pu Lehui <pulehui@huawei.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/merge.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index bbae66fc5038..cc26480098ae 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -470,7 +470,9 @@ TEST_F(merge, handle_uprobe_upon_merged_vma)
 	ASSERT_GE(fd, 0);
 
 	ASSERT_EQ(ftruncate(fd, page_size), 0);
-	ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0);
+	if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) {
+		SKIP(goto out, "Failed to read uprobe sysfs file, skipping");
+	}
 
 	memset(&attr, 0, attr_sz);
 	attr.size = attr_sz;
@@ -491,6 +493,7 @@ TEST_F(merge, handle_uprobe_upon_merged_vma)
 	ASSERT_NE(mremap(ptr2, page_size, page_size,
 			 MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
 
+out:
 	close(fd);
 	remove(probe_file);
 }
-- 
cgit v1.2.3


From b8a205486ed5c0c5c0386e472157a81ce686af25 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 19 Jun 2025 16:06:03 +0200
Subject: selftests/bpf: Convert test_sysctl to prog_tests

Convert test_sysctl test to prog_tests with minimal change to the
tests themselves.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250619140603.148942-3-jmarchan@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore             |    1 -
 tools/testing/selftests/bpf/Makefile               |    5 +-
 .../testing/selftests/bpf/prog_tests/test_sysctl.c | 1612 +++++++++++++++++++
 tools/testing/selftests/bpf/test_sysctl.c          | 1633 --------------------
 4 files changed, 1614 insertions(+), 1637 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_sysctl.c
 delete mode 100644 tools/testing/selftests/bpf/test_sysctl.c

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index e2a2c46c008b..3d8378972d26 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -21,7 +21,6 @@ test_lirc_mode2_user
 flow_dissector_load
 test_tcpnotify_user
 test_libbpf
-test_sysctl
 xdping
 test_cpp
 *.d
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index cf5ed3bee573..910d8d6402ef 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -73,7 +73,7 @@ endif
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
 	test_sockmap \
-	test_tcpnotify_user test_sysctl \
+	test_tcpnotify_user \
 	test_progs-no_alu32
 TEST_INST_SUBDIRS := no_alu32
 
@@ -220,7 +220,7 @@ ifeq ($(VMLINUX_BTF),)
 $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
 endif
 
-# Define simple and short `make test_progs`, `make test_sysctl`, etc targets
+# Define simple and short `make test_progs`, `make test_maps`, etc targets
 # to build individual tests.
 # NOTE: Semicolon at the end is critical to override lib.mk's default static
 # rule for binaries.
@@ -329,7 +329,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o
 $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS)
 $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS)
-$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_tag: $(TESTING_HELPERS)
 $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS)
 $(OUTPUT)/xdping: $(TESTING_HELPERS)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c
new file mode 100644
index 000000000000..273dd41ca09e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c
@@ -0,0 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH			"/foo"
+#define MAX_INSNS		512
+#define FIXUP_SYSCTL_VALUE	0
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sysctl_test {
+	const char *descr;
+	size_t fixup_value_insn;
+	struct bpf_insn	insns[MAX_INSNS];
+	const char *prog_file;
+	enum bpf_attach_type attach_type;
+	const char *sysctl;
+	int open_flags;
+	int seek;
+	const char *newval;
+	const char *oldval;
+	enum {
+		LOAD_REJECT,
+		ATTACH_REJECT,
+		OP_EPERM,
+		SUCCESS,
+	} result;
+};
+
+static struct sysctl_test tests[] = {
+	{
+		.descr = "sysctl wrong attach_type",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = 0,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = ATTACH_REJECT,
+	},
+	{
+		.descr = "sysctl:read allow all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl:read deny all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "ctx:write sysctl:read read ok",
+		.insns = {
+			/* If (write) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, write)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "ctx:write sysctl:write read ok",
+		.insns = {
+			/* If (write) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, write)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/domainname",
+		.open_flags = O_WRONLY,
+		.newval = "(none)", /* same as default, should fail anyway */
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "ctx:write sysctl:write read ok narrow",
+		.insns = {
+			/* u64 w = (u16)write & 1; */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+			BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, write)),
+#else
+			BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, write) + 2),
+#endif
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_7, 1),
+			/* return 1 - w; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/domainname",
+		.open_flags = O_WRONLY,
+		.newval = "(none)", /* same as default, should fail anyway */
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "ctx:write sysctl:read write reject",
+		.insns = {
+			/* write = X */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct bpf_sysctl, write)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = LOAD_REJECT,
+	},
+	{
+		.descr = "ctx:file_pos sysctl:read read ok",
+		.insns = {
+			/* If (file_pos == X) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, file_pos)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.seek = 3,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "ctx:file_pos sysctl:read read ok narrow",
+		.insns = {
+			/* If (file_pos == X) */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+			BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, file_pos)),
+#else
+			BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sysctl, file_pos) + 3),
+#endif
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 4, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.seek = 4,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "ctx:file_pos sysctl:read write ok",
+		.insns = {
+			/* file_pos = X */
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct bpf_sysctl, file_pos)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.oldval = "nux\n",
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_name sysctl_value:base ok",
+		.insns = {
+			/* sysctl_get_name arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_name arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* sysctl_get_name arg4 (flags) */
+			BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
+
+			/* sysctl_get_name(ctx, buf, buf_len, flags) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, sizeof("tcp_mem") - 1, 6),
+			/*     buf == "tcp_mem\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x7463705f6d656d00ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_name sysctl_value:base E2BIG truncated",
+		.insns = {
+			/* sysctl_get_name arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_name arg3 (buf_len) too small */
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+
+			/* sysctl_get_name arg4 (flags) */
+			BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
+
+			/* sysctl_get_name(ctx, buf, buf_len, flags) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+			/*     buf[0:7] == "tcp_me\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x7463705f6d650000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_name sysctl:full ok",
+		.insns = {
+			/* sysctl_get_name arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_name arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 17),
+
+			/* sysctl_get_name arg4 (flags) */
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+
+			/* sysctl_get_name(ctx, buf, buf_len, flags) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 16, 14),
+
+			/*     buf[0:8] == "net/ipv4" && */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x6e65742f69707634ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
+
+			/*     buf[8:16] == "/tcp_mem" && */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x2f7463705f6d656dULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+			/*     buf[16:24] == "\0") */
+			BPF_LD_IMM64(BPF_REG_8, 0x0ULL),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_name sysctl:full E2BIG truncated",
+		.insns = {
+			/* sysctl_get_name arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_name arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 16),
+
+			/* sysctl_get_name arg4 (flags) */
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+
+			/* sysctl_get_name(ctx, buf, buf_len, flags) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 10),
+
+			/*     buf[0:8] == "net/ipv4" && */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x6e65742f69707634ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+			/*     buf[8:16] == "/tcp_me\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x2f7463705f6d6500ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_name sysctl:full E2BIG truncated small",
+		.insns = {
+			/* sysctl_get_name arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_name arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+
+			/* sysctl_get_name arg4 (flags) */
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+
+			/* sysctl_get_name(ctx, buf, buf_len, flags) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+			/*     buf[0:8] == "net/ip\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x6e65742f69700000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_current_value sysctl:read ok, gt",
+		.insns = {
+			/* sysctl_get_current_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_current_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* sysctl_get_current_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
+
+			/*     buf[0:6] == "Linux\n\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_current_value sysctl:read ok, eq",
+		.insns = {
+			/* sysctl_get_current_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 7),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_current_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+
+			/* sysctl_get_current_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
+
+			/*     buf[0:6] == "Linux\n\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_current_value sysctl:read E2BIG truncated",
+		.insns = {
+			/* sysctl_get_current_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_H, BPF_REG_7, BPF_REG_0, 6),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_current_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 6),
+
+			/* sysctl_get_current_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+			/*     buf[0:6] == "Linux\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x4c696e7578000000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "kernel/ostype",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_current_value sysctl:read EINVAL",
+		.insns = {
+			/* sysctl_get_current_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_current_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* sysctl_get_current_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 4),
+
+			/*     buf[0:8] is NUL-filled) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv6/conf/lo/stable_secret", /* -EIO */
+		.open_flags = O_RDONLY,
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "sysctl_get_current_value sysctl:write ok",
+		.fixup_value_insn = 6,
+		.insns = {
+			/* sysctl_get_current_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_current_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* sysctl_get_current_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 6),
+
+			/*     buf[0:4] == expected) */
+			BPF_LD_IMM64(BPF_REG_8, FIXUP_SYSCTL_VALUE),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_WRONLY,
+		.newval = "600", /* same as default, should fail anyway */
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "sysctl_get_new_value sysctl:read EINVAL",
+		.insns = {
+			/* sysctl_get_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* sysctl_get_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_get_new_value sysctl:write ok",
+		.insns = {
+			/* sysctl_get_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 4),
+
+			/* sysctl_get_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+
+			/*     buf[0:4] == "606\0") */
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+				    bpf_ntohl(0x36303600), 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_WRONLY,
+		.newval = "606",
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "sysctl_get_new_value sysctl:write ok long",
+		.insns = {
+			/* sysctl_get_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 24),
+
+			/* sysctl_get_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 23, 14),
+
+			/*     buf[0:8] == "3000000 " && */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x3330303030303020ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
+
+			/*     buf[8:16] == "4000000 " && */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x3430303030303020ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+			/*     buf[16:24] == "6000000\0") */
+			BPF_LD_IMM64(BPF_REG_8,
+				     bpf_be64_to_cpu(0x3630303030303000ULL)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_WRONLY,
+		.newval = "3000000 4000000 6000000",
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "sysctl_get_new_value sysctl:write E2BIG",
+		.insns = {
+			/* sysctl_get_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 3),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_get_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 3),
+
+			/* sysctl_get_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 4),
+
+			/*     buf[0:3] == "60\0") */
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+				    bpf_ntohl(0x36300000), 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_WRONLY,
+		.newval = "606",
+		.result = OP_EPERM,
+	},
+	{
+		.descr = "sysctl_set_new_value sysctl:read EINVAL",
+		.insns = {
+			/* sysctl_set_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x36303000)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_set_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 3),
+
+			/* sysctl_set_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		.descr = "sysctl_set_new_value sysctl:write ok",
+		.fixup_value_insn = 2,
+		.insns = {
+			/* sysctl_set_new_value arg2 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_LD_IMM64(BPF_REG_0, FIXUP_SYSCTL_VALUE),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+			/* sysctl_set_new_value arg3 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_3, 3),
+
+			/* sysctl_set_new_value(ctx, buf, buf_len) */
+			BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_WRONLY,
+		.newval = "606",
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul one number string",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x36303000)),
+			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+			/*     res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul multi number string",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			/* "600 602\0" */
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3630302036303200ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 8),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 18),
+			/*     res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 16),
+
+			/*     arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/*     arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 8),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_0),
+
+			/*     arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/*     arg4 (res) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/*     if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
+			/*         res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 602, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul buf_len = 0, reject",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x36303000)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = LOAD_REJECT,
+	},
+	{
+		"bpf_strtoul supported base, ok",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x30373700)),
+			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 8),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+			/*     res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 63, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul unsupported base, EINVAL",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x36303000)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 3),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul buf with spaces only, EINVAL",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x0d0c0a09)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtoul negative number, EINVAL",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			/* " -6\0" */
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x0a2d3600)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtol negative number, ok",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			/* " -6\0" */
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x0a2d3600)),
+			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 10),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+			/*     res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, -6, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtol hex number, ok",
+		.insns = {
+			/* arg1 (buf) */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			/* "0xfe" */
+			BPF_MOV64_IMM(BPF_REG_0,
+				      bpf_ntohl(0x30786665)),
+			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
+			/*     res == expected) */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 254, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtol max long",
+		.insns = {
+			/* arg1 (buf) 9223372036854775807 */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3932323333373230ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3336383534373735ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3830370000000000ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 19),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+			/* if (ret == expected && */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 19, 6),
+			/*     res == expected) */
+			BPF_LD_IMM64(BPF_REG_8, 0x7fffffffffffffffULL),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"bpf_strtol overflow, ERANGE",
+		.insns = {
+			/* arg1 (buf) 9223372036854775808 */
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3932323333373230ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3336383534373735ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+			BPF_LD_IMM64(BPF_REG_0,
+				     bpf_be64_to_cpu(0x3830380000000000ULL)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+			/* arg2 (buf_len) */
+			BPF_MOV64_IMM(BPF_REG_2, 19),
+
+			/* arg3 (flags) */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+
+			/* arg4 (res) */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+			BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+			/* if (ret == expected) */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -ERANGE, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+	{
+		"C prog: deny all writes",
+		.prog_file = "./test_sysctl_prog.bpf.o",
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_WRONLY,
+		.newval = "123 456 789",
+		.result = OP_EPERM,
+	},
+	{
+		"C prog: deny access by name",
+		.prog_file = "./test_sysctl_prog.bpf.o",
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/route/mtu_expires",
+		.open_flags = O_RDONLY,
+		.result = OP_EPERM,
+	},
+	{
+		"C prog: read tcp_mem",
+		.prog_file = "./test_sysctl_prog.bpf.o",
+		.attach_type = BPF_CGROUP_SYSCTL,
+		.sysctl = "net/ipv4/tcp_mem",
+		.open_flags = O_RDONLY,
+		.result = SUCCESS,
+	},
+};
+
+static size_t probe_prog_length(const struct bpf_insn *fp)
+{
+	size_t len;
+
+	for (len = MAX_INSNS - 1; len > 0; --len)
+		if (fp[len].code != 0 || fp[len].imm != 0)
+			break;
+	return len + 1;
+}
+
+static int fixup_sysctl_value(const char *buf, size_t buf_len,
+			      struct bpf_insn *prog, size_t insn_num)
+{
+	union {
+		uint8_t raw[sizeof(uint64_t)];
+		uint64_t num;
+	} value = {};
+
+	if (buf_len > sizeof(value)) {
+		log_err("Value is too big (%zd) to use in fixup", buf_len);
+		return -1;
+	}
+	if (prog[insn_num].code != (BPF_LD | BPF_DW | BPF_IMM)) {
+		log_err("Can fixup only BPF_LD_IMM64 insns");
+		return -1;
+	}
+
+	memcpy(value.raw, buf, buf_len);
+	prog[insn_num].imm = (uint32_t)value.num;
+	prog[insn_num + 1].imm = (uint32_t)(value.num >> 32);
+
+	return 0;
+}
+
+static int load_sysctl_prog_insns(struct sysctl_test *test,
+				  const char *sysctl_path)
+{
+	struct bpf_insn *prog = test->insns;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
+	int ret, insn_cnt;
+
+	insn_cnt = probe_prog_length(prog);
+
+	if (test->fixup_value_insn) {
+		char buf[128];
+		ssize_t len;
+		int fd;
+
+		fd = open(sysctl_path, O_RDONLY | O_CLOEXEC);
+		if (fd < 0) {
+			log_err("open(%s) failed", sysctl_path);
+			return -1;
+		}
+		len = read(fd, buf, sizeof(buf));
+		if (len == -1) {
+			log_err("read(%s) failed", sysctl_path);
+			close(fd);
+			return -1;
+		}
+		close(fd);
+		if (fixup_sysctl_value(buf, len, prog, test->fixup_value_insn))
+			return -1;
+	}
+
+	opts.log_buf = bpf_log_buf;
+	opts.log_size = BPF_LOG_BUF_SIZE;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SYSCTL, NULL, "GPL", prog, insn_cnt, &opts);
+	if (ret < 0 && test->result != LOAD_REJECT) {
+		log_err(">>> Loading program error.\n"
+			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
+	}
+
+	return ret;
+}
+
+static int load_sysctl_prog_file(struct sysctl_test *test)
+{
+	struct bpf_object *obj;
+	int prog_fd;
+
+	if (bpf_prog_test_load(test->prog_file, BPF_PROG_TYPE_CGROUP_SYSCTL, &obj, &prog_fd)) {
+		if (test->result != LOAD_REJECT)
+			log_err(">>> Loading program (%s) error.\n",
+				test->prog_file);
+		return -1;
+	}
+
+	return prog_fd;
+}
+
+static int load_sysctl_prog(struct sysctl_test *test, const char *sysctl_path)
+{
+		return test->prog_file
+			? load_sysctl_prog_file(test)
+			: load_sysctl_prog_insns(test, sysctl_path);
+}
+
+static int access_sysctl(const char *sysctl_path,
+			 const struct sysctl_test *test)
+{
+	int err = 0;
+	int fd;
+
+	fd = open(sysctl_path, test->open_flags | O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
+		log_err("lseek(%d) failed", test->seek);
+		goto err;
+	}
+
+	if (test->open_flags == O_RDONLY) {
+		char buf[128];
+
+		if (read(fd, buf, sizeof(buf)) == -1)
+			goto err;
+		if (test->oldval &&
+		    strncmp(buf, test->oldval, strlen(test->oldval))) {
+			log_err("Read value %s != %s", buf, test->oldval);
+			goto err;
+		}
+	} else if (test->open_flags == O_WRONLY) {
+		if (!test->newval) {
+			log_err("New value for sysctl is not set");
+			goto err;
+		}
+		if (write(fd, test->newval, strlen(test->newval)) == -1)
+			goto err;
+	} else {
+		log_err("Unexpected sysctl access: neither read nor write");
+		goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(fd);
+	return err;
+}
+
+static int run_test_case(int cgfd, struct sysctl_test *test)
+{
+	enum bpf_attach_type atype = test->attach_type;
+	char sysctl_path[128];
+	int progfd = -1;
+	int err = 0;
+
+	printf("Test case: %s .. ", test->descr);
+
+	snprintf(sysctl_path, sizeof(sysctl_path), "/proc/sys/%s",
+		 test->sysctl);
+
+	progfd = load_sysctl_prog(test, sysctl_path);
+	if (progfd < 0) {
+		if (test->result == LOAD_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (bpf_prog_attach(progfd, cgfd, atype, BPF_F_ALLOW_OVERRIDE) < 0) {
+		if (test->result == ATTACH_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	errno = 0;
+	if (access_sysctl(sysctl_path, test) == -1) {
+		if (test->result == OP_EPERM && errno == EPERM)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (test->result != SUCCESS) {
+		log_err("Unexpected success");
+		goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Detaching w/o checking return code: best effort attempt. */
+	if (progfd != -1)
+		bpf_prog_detach(cgfd, atype);
+	close(progfd);
+	printf("[%s]\n", err ? "FAIL" : "PASS");
+	return err;
+}
+
+static int run_tests(int cgfd)
+{
+	int passes = 0;
+	int fails = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		if (run_test_case(cgfd, &tests[i]))
+			++fails;
+		else
+			++passes;
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
+	return fails ? -1 : 0;
+}
+
+void test_sysctl(void)
+{
+	int cgfd;
+
+	cgfd = cgroup_setup_and_join(CG_PATH);
+	if (!ASSERT_OK_FD(cgfd < 0, "create_cgroup"))
+		goto out;
+
+	if (!ASSERT_OK(run_tests(cgfd), "run_tests"))
+		goto out;
+
+out:
+	close(cgfd);
+	cleanup_cgroup_environment();
+	return;
+}
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
deleted file mode 100644
index bcdbd27f22f0..000000000000
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ /dev/null
@@ -1,1633 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2019 Facebook
-
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <linux/filter.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include <bpf/bpf_endian.h>
-#include "bpf_util.h"
-#include "cgroup_helpers.h"
-#include "testing_helpers.h"
-
-#define CG_PATH			"/foo"
-#define MAX_INSNS		512
-#define FIXUP_SYSCTL_VALUE	0
-
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
-
-struct sysctl_test {
-	const char *descr;
-	size_t fixup_value_insn;
-	struct bpf_insn	insns[MAX_INSNS];
-	const char *prog_file;
-	enum bpf_attach_type attach_type;
-	const char *sysctl;
-	int open_flags;
-	int seek;
-	const char *newval;
-	const char *oldval;
-	enum {
-		LOAD_REJECT,
-		ATTACH_REJECT,
-		OP_EPERM,
-		SUCCESS,
-	} result;
-};
-
-static struct sysctl_test tests[] = {
-	{
-		.descr = "sysctl wrong attach_type",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = 0,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = ATTACH_REJECT,
-	},
-	{
-		.descr = "sysctl:read allow all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl:read deny all",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "ctx:write sysctl:read read ok",
-		.insns = {
-			/* If (write) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, write)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "ctx:write sysctl:write read ok",
-		.insns = {
-			/* If (write) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, write)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/domainname",
-		.open_flags = O_WRONLY,
-		.newval = "(none)", /* same as default, should fail anyway */
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "ctx:write sysctl:write read ok narrow",
-		.insns = {
-			/* u64 w = (u16)write & 1; */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-			BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, write)),
-#else
-			BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, write) + 2),
-#endif
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_7, 1),
-			/* return 1 - w; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/domainname",
-		.open_flags = O_WRONLY,
-		.newval = "(none)", /* same as default, should fail anyway */
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "ctx:write sysctl:read write reject",
-		.insns = {
-			/* write = X */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct bpf_sysctl, write)),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = LOAD_REJECT,
-	},
-	{
-		.descr = "ctx:file_pos sysctl:read read ok",
-		.insns = {
-			/* If (file_pos == X) */
-			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, file_pos)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.seek = 3,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "ctx:file_pos sysctl:read read ok narrow",
-		.insns = {
-			/* If (file_pos == X) */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-			BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, file_pos)),
-#else
-			BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
-				    offsetof(struct bpf_sysctl, file_pos) + 3),
-#endif
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 4, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.seek = 4,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "ctx:file_pos sysctl:read write ok",
-		.insns = {
-			/* file_pos = X */
-			BPF_MOV64_IMM(BPF_REG_0, 2),
-			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct bpf_sysctl, file_pos)),
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.oldval = "nux\n",
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_name sysctl_value:base ok",
-		.insns = {
-			/* sysctl_get_name arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_name arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* sysctl_get_name arg4 (flags) */
-			BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
-
-			/* sysctl_get_name(ctx, buf, buf_len, flags) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, sizeof("tcp_mem") - 1, 6),
-			/*     buf == "tcp_mem\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x7463705f6d656d00ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_name sysctl_value:base E2BIG truncated",
-		.insns = {
-			/* sysctl_get_name arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_name arg3 (buf_len) too small */
-			BPF_MOV64_IMM(BPF_REG_3, 7),
-
-			/* sysctl_get_name arg4 (flags) */
-			BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
-
-			/* sysctl_get_name(ctx, buf, buf_len, flags) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
-
-			/*     buf[0:7] == "tcp_me\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x7463705f6d650000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_name sysctl:full ok",
-		.insns = {
-			/* sysctl_get_name arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_name arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 17),
-
-			/* sysctl_get_name arg4 (flags) */
-			BPF_MOV64_IMM(BPF_REG_4, 0),
-
-			/* sysctl_get_name(ctx, buf, buf_len, flags) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 16, 14),
-
-			/*     buf[0:8] == "net/ipv4" && */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x6e65742f69707634ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
-
-			/*     buf[8:16] == "/tcp_mem" && */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x2f7463705f6d656dULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
-
-			/*     buf[16:24] == "\0") */
-			BPF_LD_IMM64(BPF_REG_8, 0x0ULL),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_name sysctl:full E2BIG truncated",
-		.insns = {
-			/* sysctl_get_name arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_name arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 16),
-
-			/* sysctl_get_name arg4 (flags) */
-			BPF_MOV64_IMM(BPF_REG_4, 0),
-
-			/* sysctl_get_name(ctx, buf, buf_len, flags) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 10),
-
-			/*     buf[0:8] == "net/ipv4" && */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x6e65742f69707634ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
-
-			/*     buf[8:16] == "/tcp_me\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x2f7463705f6d6500ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_name sysctl:full E2BIG truncated small",
-		.insns = {
-			/* sysctl_get_name arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_name arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 7),
-
-			/* sysctl_get_name arg4 (flags) */
-			BPF_MOV64_IMM(BPF_REG_4, 0),
-
-			/* sysctl_get_name(ctx, buf, buf_len, flags) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
-
-			/*     buf[0:8] == "net/ip\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x6e65742f69700000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_current_value sysctl:read ok, gt",
-		.insns = {
-			/* sysctl_get_current_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_current_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* sysctl_get_current_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
-
-			/*     buf[0:6] == "Linux\n\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_current_value sysctl:read ok, eq",
-		.insns = {
-			/* sysctl_get_current_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 7),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_current_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 7),
-
-			/* sysctl_get_current_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
-
-			/*     buf[0:6] == "Linux\n\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_current_value sysctl:read E2BIG truncated",
-		.insns = {
-			/* sysctl_get_current_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_H, BPF_REG_7, BPF_REG_0, 6),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_current_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 6),
-
-			/* sysctl_get_current_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
-
-			/*     buf[0:6] == "Linux\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x4c696e7578000000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "kernel/ostype",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_current_value sysctl:read EINVAL",
-		.insns = {
-			/* sysctl_get_current_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_current_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* sysctl_get_current_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 4),
-
-			/*     buf[0:8] is NUL-filled) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv6/conf/lo/stable_secret", /* -EIO */
-		.open_flags = O_RDONLY,
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "sysctl_get_current_value sysctl:write ok",
-		.fixup_value_insn = 6,
-		.insns = {
-			/* sysctl_get_current_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_current_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* sysctl_get_current_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 6),
-
-			/*     buf[0:4] == expected) */
-			BPF_LD_IMM64(BPF_REG_8, FIXUP_SYSCTL_VALUE),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_WRONLY,
-		.newval = "600", /* same as default, should fail anyway */
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "sysctl_get_new_value sysctl:read EINVAL",
-		.insns = {
-			/* sysctl_get_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* sysctl_get_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_get_new_value sysctl:write ok",
-		.insns = {
-			/* sysctl_get_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 4),
-
-			/* sysctl_get_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
-
-			/*     buf[0:4] == "606\0") */
-			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
-				    bpf_ntohl(0x36303600), 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_WRONLY,
-		.newval = "606",
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "sysctl_get_new_value sysctl:write ok long",
-		.insns = {
-			/* sysctl_get_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 24),
-
-			/* sysctl_get_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 23, 14),
-
-			/*     buf[0:8] == "3000000 " && */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x3330303030303020ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
-
-			/*     buf[8:16] == "4000000 " && */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x3430303030303020ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
-
-			/*     buf[16:24] == "6000000\0") */
-			BPF_LD_IMM64(BPF_REG_8,
-				     bpf_be64_to_cpu(0x3630303030303000ULL)),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_WRONLY,
-		.newval = "3000000 4000000 6000000",
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "sysctl_get_new_value sysctl:write E2BIG",
-		.insns = {
-			/* sysctl_get_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 3),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_get_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 3),
-
-			/* sysctl_get_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 4),
-
-			/*     buf[0:3] == "60\0") */
-			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
-				    bpf_ntohl(0x36300000), 2),
-
-			/* return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_JMP_A(1),
-
-			/* else return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_WRONLY,
-		.newval = "606",
-		.result = OP_EPERM,
-	},
-	{
-		.descr = "sysctl_set_new_value sysctl:read EINVAL",
-		.insns = {
-			/* sysctl_set_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x36303000)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_set_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 3),
-
-			/* sysctl_set_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		.descr = "sysctl_set_new_value sysctl:write ok",
-		.fixup_value_insn = 2,
-		.insns = {
-			/* sysctl_set_new_value arg2 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_LD_IMM64(BPF_REG_0, FIXUP_SYSCTL_VALUE),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
-
-			/* sysctl_set_new_value arg3 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_3, 3),
-
-			/* sysctl_set_new_value(ctx, buf, buf_len) */
-			BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_WRONLY,
-		.newval = "606",
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul one number string",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x36303000)),
-			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
-			/*     res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul multi number string",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			/* "600 602\0" */
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3630302036303200ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 8),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 18),
-			/*     res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 16),
-
-			/*     arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/*     arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 8),
-			BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_0),
-
-			/*     arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/*     arg4 (res) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/*     if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
-			/*         res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 602, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul buf_len = 0, reject",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x36303000)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 0),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = LOAD_REJECT,
-	},
-	{
-		"bpf_strtoul supported base, ok",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x30373700)),
-			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 8),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
-			/*     res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 63, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul unsupported base, EINVAL",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x36303000)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 3),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul buf with spaces only, EINVAL",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x0d0c0a09)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtoul negative number, EINVAL",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			/* " -6\0" */
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x0a2d3600)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtoul),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtol negative number, ok",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			/* " -6\0" */
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x0a2d3600)),
-			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 10),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtol),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
-			/*     res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, -6, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtol hex number, ok",
-		.insns = {
-			/* arg1 (buf) */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			/* "0xfe" */
-			BPF_MOV64_IMM(BPF_REG_0,
-				      bpf_ntohl(0x30786665)),
-			BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 4),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtol),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
-			/*     res == expected) */
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 254, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtol max long",
-		.insns = {
-			/* arg1 (buf) 9223372036854775807 */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3932323333373230ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3336383534373735ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3830370000000000ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 19),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtol),
-
-			/* if (ret == expected && */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 19, 6),
-			/*     res == expected) */
-			BPF_LD_IMM64(BPF_REG_8, 0x7fffffffffffffffULL),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
-			BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"bpf_strtol overflow, ERANGE",
-		.insns = {
-			/* arg1 (buf) 9223372036854775808 */
-			BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3932323333373230ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3336383534373735ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-			BPF_LD_IMM64(BPF_REG_0,
-				     bpf_be64_to_cpu(0x3830380000000000ULL)),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
-
-			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
-
-			/* arg2 (buf_len) */
-			BPF_MOV64_IMM(BPF_REG_2, 19),
-
-			/* arg3 (flags) */
-			BPF_MOV64_IMM(BPF_REG_3, 0),
-
-			/* arg4 (res) */
-			BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-			BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
-
-			BPF_EMIT_CALL(BPF_FUNC_strtol),
-
-			/* if (ret == expected) */
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -ERANGE, 2),
-
-			/* return ALLOW; */
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_JMP_A(1),
-
-			/* else return DENY; */
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_EXIT_INSN(),
-		},
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-	{
-		"C prog: deny all writes",
-		.prog_file = "./test_sysctl_prog.bpf.o",
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_WRONLY,
-		.newval = "123 456 789",
-		.result = OP_EPERM,
-	},
-	{
-		"C prog: deny access by name",
-		.prog_file = "./test_sysctl_prog.bpf.o",
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/route/mtu_expires",
-		.open_flags = O_RDONLY,
-		.result = OP_EPERM,
-	},
-	{
-		"C prog: read tcp_mem",
-		.prog_file = "./test_sysctl_prog.bpf.o",
-		.attach_type = BPF_CGROUP_SYSCTL,
-		.sysctl = "net/ipv4/tcp_mem",
-		.open_flags = O_RDONLY,
-		.result = SUCCESS,
-	},
-};
-
-static size_t probe_prog_length(const struct bpf_insn *fp)
-{
-	size_t len;
-
-	for (len = MAX_INSNS - 1; len > 0; --len)
-		if (fp[len].code != 0 || fp[len].imm != 0)
-			break;
-	return len + 1;
-}
-
-static int fixup_sysctl_value(const char *buf, size_t buf_len,
-			      struct bpf_insn *prog, size_t insn_num)
-{
-	union {
-		uint8_t raw[sizeof(uint64_t)];
-		uint64_t num;
-	} value = {};
-
-	if (buf_len > sizeof(value)) {
-		log_err("Value is too big (%zd) to use in fixup", buf_len);
-		return -1;
-	}
-	if (prog[insn_num].code != (BPF_LD | BPF_DW | BPF_IMM)) {
-		log_err("Can fixup only BPF_LD_IMM64 insns");
-		return -1;
-	}
-
-	memcpy(value.raw, buf, buf_len);
-	prog[insn_num].imm = (uint32_t)value.num;
-	prog[insn_num + 1].imm = (uint32_t)(value.num >> 32);
-
-	return 0;
-}
-
-static int load_sysctl_prog_insns(struct sysctl_test *test,
-				  const char *sysctl_path)
-{
-	struct bpf_insn *prog = test->insns;
-	LIBBPF_OPTS(bpf_prog_load_opts, opts);
-	int ret, insn_cnt;
-
-	insn_cnt = probe_prog_length(prog);
-
-	if (test->fixup_value_insn) {
-		char buf[128];
-		ssize_t len;
-		int fd;
-
-		fd = open(sysctl_path, O_RDONLY | O_CLOEXEC);
-		if (fd < 0) {
-			log_err("open(%s) failed", sysctl_path);
-			return -1;
-		}
-		len = read(fd, buf, sizeof(buf));
-		if (len == -1) {
-			log_err("read(%s) failed", sysctl_path);
-			close(fd);
-			return -1;
-		}
-		close(fd);
-		if (fixup_sysctl_value(buf, len, prog, test->fixup_value_insn))
-			return -1;
-	}
-
-	opts.log_buf = bpf_log_buf;
-	opts.log_size = BPF_LOG_BUF_SIZE;
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SYSCTL, NULL, "GPL", prog, insn_cnt, &opts);
-	if (ret < 0 && test->result != LOAD_REJECT) {
-		log_err(">>> Loading program error.\n"
-			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
-	}
-
-	return ret;
-}
-
-static int load_sysctl_prog_file(struct sysctl_test *test)
-{
-	struct bpf_object *obj;
-	int prog_fd;
-
-	if (bpf_prog_test_load(test->prog_file, BPF_PROG_TYPE_CGROUP_SYSCTL, &obj, &prog_fd)) {
-		if (test->result != LOAD_REJECT)
-			log_err(">>> Loading program (%s) error.\n",
-				test->prog_file);
-		return -1;
-	}
-
-	return prog_fd;
-}
-
-static int load_sysctl_prog(struct sysctl_test *test, const char *sysctl_path)
-{
-		return test->prog_file
-			? load_sysctl_prog_file(test)
-			: load_sysctl_prog_insns(test, sysctl_path);
-}
-
-static int access_sysctl(const char *sysctl_path,
-			 const struct sysctl_test *test)
-{
-	int err = 0;
-	int fd;
-
-	fd = open(sysctl_path, test->open_flags | O_CLOEXEC);
-	if (fd < 0)
-		return fd;
-
-	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
-		log_err("lseek(%d) failed", test->seek);
-		goto err;
-	}
-
-	if (test->open_flags == O_RDONLY) {
-		char buf[128];
-
-		if (read(fd, buf, sizeof(buf)) == -1)
-			goto err;
-		if (test->oldval &&
-		    strncmp(buf, test->oldval, strlen(test->oldval))) {
-			log_err("Read value %s != %s", buf, test->oldval);
-			goto err;
-		}
-	} else if (test->open_flags == O_WRONLY) {
-		if (!test->newval) {
-			log_err("New value for sysctl is not set");
-			goto err;
-		}
-		if (write(fd, test->newval, strlen(test->newval)) == -1)
-			goto err;
-	} else {
-		log_err("Unexpected sysctl access: neither read nor write");
-		goto err;
-	}
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(fd);
-	return err;
-}
-
-static int run_test_case(int cgfd, struct sysctl_test *test)
-{
-	enum bpf_attach_type atype = test->attach_type;
-	char sysctl_path[128];
-	int progfd = -1;
-	int err = 0;
-
-	printf("Test case: %s .. ", test->descr);
-
-	snprintf(sysctl_path, sizeof(sysctl_path), "/proc/sys/%s",
-		 test->sysctl);
-
-	progfd = load_sysctl_prog(test, sysctl_path);
-	if (progfd < 0) {
-		if (test->result == LOAD_REJECT)
-			goto out;
-		else
-			goto err;
-	}
-
-	if (bpf_prog_attach(progfd, cgfd, atype, BPF_F_ALLOW_OVERRIDE) < 0) {
-		if (test->result == ATTACH_REJECT)
-			goto out;
-		else
-			goto err;
-	}
-
-	errno = 0;
-	if (access_sysctl(sysctl_path, test) == -1) {
-		if (test->result == OP_EPERM && errno == EPERM)
-			goto out;
-		else
-			goto err;
-	}
-
-	if (test->result != SUCCESS) {
-		log_err("Unexpected success");
-		goto err;
-	}
-
-	goto out;
-err:
-	err = -1;
-out:
-	/* Detaching w/o checking return code: best effort attempt. */
-	if (progfd != -1)
-		bpf_prog_detach(cgfd, atype);
-	close(progfd);
-	printf("[%s]\n", err ? "FAIL" : "PASS");
-	return err;
-}
-
-static int run_tests(int cgfd)
-{
-	int passes = 0;
-	int fails = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
-		if (run_test_case(cgfd, &tests[i]))
-			++fails;
-		else
-			++passes;
-	}
-	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
-	return fails ? -1 : 0;
-}
-
-int main(int argc, char **argv)
-{
-	int cgfd = -1;
-	int err = 0;
-
-	cgfd = cgroup_setup_and_join(CG_PATH);
-	if (cgfd < 0)
-		goto err;
-
-	/* Use libbpf 1.0 API mode */
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	if (run_tests(cgfd))
-		goto err;
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(cgfd);
-	cleanup_cgroup_environment();
-	return err;
-}
-- 
cgit v1.2.3


From e1ca44e85f652a6ebd657c67c394894c1fdfb403 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 18 Jun 2025 21:13:56 -0700
Subject: af_unix: Add test for consecutive consumed OOB.

Let's add a test case where consecutive concumed OOB skbs stay
at the head of the queue.

Without the previous patch, ioctl(SIOCATMARK) assertion fails.

Before:

  #  RUN           msg_oob.no_peek.ex_oob_ex_oob_oob ...
  # msg_oob.c:305:ex_oob_ex_oob_oob:Expected answ[0] (0) == oob_head (1)
  # ex_oob_ex_oob_oob: Test terminated by assertion
  #          FAIL  msg_oob.no_peek.ex_oob_ex_oob_oob
  not ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob

After:

  #  RUN           msg_oob.no_peek.ex_oob_ex_oob_oob ...
  #            OK  msg_oob.no_peek.ex_oob_ex_oob_oob
  ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250619041457.1132791-3-kuni1840@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/af_unix/msg_oob.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c
index 3ed3882a93b8..918509a3f040 100644
--- a/tools/testing/selftests/net/af_unix/msg_oob.c
+++ b/tools/testing/selftests/net/af_unix/msg_oob.c
@@ -548,6 +548,29 @@ TEST_F(msg_oob, ex_oob_oob)
 	siocatmarkpair(false);
 }
 
+TEST_F(msg_oob, ex_oob_ex_oob_oob)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("x", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("y", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	sendpair("z", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+}
+
 TEST_F(msg_oob, ex_oob_ahead_break)
 {
 	sendpair("hello", 5, MSG_OOB);
-- 
cgit v1.2.3


From 632f55fa60c481035297739ecd374d945c9b32c7 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 18 Jun 2025 21:13:58 -0700
Subject: selftest: af_unix: Add tests for -ECONNRESET.

A new function resetpair() calls close() for the receiver and checks
the return value from recv() on the initial sender side.

Now resetpair() is added to each test case and some additional test
cases.

Note that TCP sets -ECONNRESET to the consumed OOB, but we have decided
not to touch TCP MSG_OOB code in the past.

Before:

  #  RUN           msg_oob.no_peek.ex_oob_ex_oob ...
  # msg_oob.c:236:ex_oob_ex_oob:AF_UNIX :Connection reset by peer
  # msg_oob.c:237:ex_oob_ex_oob:Expected:
  # msg_oob.c:239:ex_oob_ex_oob:Expected ret[0] (-1) == expected_len (0)
  # ex_oob_ex_oob: Test terminated by assertion
  #          FAIL  msg_oob.no_peek.ex_oob_ex_oob
  not ok 14 msg_oob.no_peek.ex_oob_ex_oob
  ...
  # FAILED: 36 / 48 tests passed.
  # Totals: pass:36 fail:12 xfail:0 xpass:0 skip:0 error:0

After:

  #  RUN           msg_oob.no_peek.ex_oob_ex_oob ...
  # msg_oob.c:244:ex_oob_ex_oob:AF_UNIX :
  # msg_oob.c:245:ex_oob_ex_oob:TCP     :Connection reset by peer
  #            OK  msg_oob.no_peek.ex_oob_ex_oob
  ok 14 msg_oob.no_peek.ex_oob_ex_oob
  ...
  # PASSED: 48 / 48 tests passed.
  # Totals: pass:48 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250619041457.1132791-5-kuni1840@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/af_unix/msg_oob.c | 119 +++++++++++++++++++++++++-
 1 file changed, 115 insertions(+), 4 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c
index 918509a3f040..b5f474969917 100644
--- a/tools/testing/selftests/net/af_unix/msg_oob.c
+++ b/tools/testing/selftests/net/af_unix/msg_oob.c
@@ -210,7 +210,7 @@ static void __sendpair(struct __test_metadata *_metadata,
 static void __recvpair(struct __test_metadata *_metadata,
 		       FIXTURE_DATA(msg_oob) *self,
 		       const char *expected_buf, int expected_len,
-		       int buf_len, int flags)
+		       int buf_len, int flags, bool is_sender)
 {
 	int i, ret[2], recv_errno[2], expected_errno = 0;
 	char recv_buf[2][BUF_SZ] = {};
@@ -221,7 +221,9 @@ static void __recvpair(struct __test_metadata *_metadata,
 	errno = 0;
 
 	for (i = 0; i < 2; i++) {
-		ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags);
+		int index = is_sender ? i * 2 : i * 2 + 1;
+
+		ret[i] = recv(self->fd[index], recv_buf[i], buf_len, flags);
 		recv_errno[i] = errno;
 	}
 
@@ -308,6 +310,20 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
 		ASSERT_EQ(answ[0], answ[1]);
 }
 
+static void __resetpair(struct __test_metadata *_metadata,
+			FIXTURE_DATA(msg_oob) *self,
+			const FIXTURE_VARIANT(msg_oob) *variant,
+			bool reset)
+{
+	int i;
+
+	for (i = 0; i < 2; i++)
+		close(self->fd[i * 2 + 1]);
+
+	__recvpair(_metadata, self, "", reset ? -ECONNRESET : 0, 1,
+		   variant->peek ? MSG_PEEK : 0, true);
+}
+
 #define sendpair(buf, len, flags)					\
 	__sendpair(_metadata, self, buf, len, flags)
 
@@ -316,9 +332,10 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
 		if (variant->peek)					\
 			__recvpair(_metadata, self,			\
 				   expected_buf, expected_len,		\
-				   buf_len, (flags) | MSG_PEEK);	\
+				   buf_len, (flags) | MSG_PEEK, false);	\
 		__recvpair(_metadata, self,				\
-			   expected_buf, expected_len, buf_len, flags);	\
+			   expected_buf, expected_len,			\
+			   buf_len, flags, false);			\
 	} while (0)
 
 #define epollpair(oob_remaining)					\
@@ -330,6 +347,9 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
 #define setinlinepair()							\
 	__setinlinepair(_metadata, self)
 
+#define resetpair(reset)						\
+	__resetpair(_metadata, self, variant, reset)
+
 #define tcp_incompliant							\
 	for (self->tcp_compliant = false;				\
 	     self->tcp_compliant == false;				\
@@ -344,6 +364,21 @@ TEST_F(msg_oob, non_oob)
 	recvpair("", -EINVAL, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(true);
+}
+
+TEST_F(msg_oob, non_oob_no_reset)
+{
+	sendpair("x", 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("x", 1, 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, oob)
@@ -355,6 +390,19 @@ TEST_F(msg_oob, oob)
 	recvpair("x", 1, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(true);
+
+	tcp_incompliant {
+		resetpair(false);		/* TCP sets -ECONNRESET for ex-OOB. */
+	}
+}
+
+TEST_F(msg_oob, oob_reset)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	resetpair(true);
 }
 
 TEST_F(msg_oob, oob_drop)
@@ -370,6 +418,8 @@ TEST_F(msg_oob, oob_drop)
 	recvpair("", -EINVAL, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, oob_ahead)
@@ -385,6 +435,10 @@ TEST_F(msg_oob, oob_ahead)
 	recvpair("hell", 4, 4, 0);
 	epollpair(false);
 	siocatmarkpair(true);
+
+	tcp_incompliant {
+		resetpair(false);		/* TCP sets -ECONNRESET for ex-OOB. */
+	}
 }
 
 TEST_F(msg_oob, oob_break)
@@ -403,6 +457,8 @@ TEST_F(msg_oob, oob_break)
 
 	recvpair("", -EAGAIN, 1, 0);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, oob_ahead_break)
@@ -426,6 +482,8 @@ TEST_F(msg_oob, oob_ahead_break)
 	recvpair("world", 5, 5, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, oob_break_drop)
@@ -449,6 +507,8 @@ TEST_F(msg_oob, oob_break_drop)
 	recvpair("", -EINVAL, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, ex_oob_break)
@@ -476,6 +536,8 @@ TEST_F(msg_oob, ex_oob_break)
 	recvpair("ld", 2, 2, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, ex_oob_drop)
@@ -498,6 +560,8 @@ TEST_F(msg_oob, ex_oob_drop)
 		epollpair(false);
 		siocatmarkpair(true);
 	}
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, ex_oob_drop_2)
@@ -523,6 +587,8 @@ TEST_F(msg_oob, ex_oob_drop_2)
 		epollpair(false);
 		siocatmarkpair(true);
 	}
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, ex_oob_oob)
@@ -546,6 +612,31 @@ TEST_F(msg_oob, ex_oob_oob)
 	recvpair("", -EINVAL, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
+}
+
+TEST_F(msg_oob, ex_oob_ex_oob)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("x", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("y", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	tcp_incompliant {
+		resetpair(false);		/* TCP sets -ECONNRESET for ex-OOB. */
+	}
 }
 
 TEST_F(msg_oob, ex_oob_ex_oob_oob)
@@ -599,6 +690,10 @@ TEST_F(msg_oob, ex_oob_ahead_break)
 	recvpair("d", 1, 1, MSG_OOB);
 	epollpair(false);
 	siocatmarkpair(true);
+
+	tcp_incompliant {
+		resetpair(false);		/* TCP sets -ECONNRESET for ex-OOB. */
+	}
 }
 
 TEST_F(msg_oob, ex_oob_siocatmark)
@@ -618,6 +713,8 @@ TEST_F(msg_oob, ex_oob_siocatmark)
 	recvpair("hell", 4, 4, 0);		/* Intentionally stop at ex-OOB. */
 	epollpair(true);
 	siocatmarkpair(false);
+
+	resetpair(true);
 }
 
 TEST_F(msg_oob, inline_oob)
@@ -635,6 +732,8 @@ TEST_F(msg_oob, inline_oob)
 	recvpair("x", 1, 1, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_oob_break)
@@ -656,6 +755,8 @@ TEST_F(msg_oob, inline_oob_break)
 	recvpair("o", 1, 1, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_oob_ahead_break)
@@ -684,6 +785,8 @@ TEST_F(msg_oob, inline_oob_ahead_break)
 
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_ex_oob_break)
@@ -709,6 +812,8 @@ TEST_F(msg_oob, inline_ex_oob_break)
 	recvpair("rld", 3, 3, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_ex_oob_no_drop)
@@ -730,6 +835,8 @@ TEST_F(msg_oob, inline_ex_oob_no_drop)
 	recvpair("y", 1, 1, 0);
 	epollpair(false);
 	siocatmarkpair(false);
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_ex_oob_drop)
@@ -754,6 +861,8 @@ TEST_F(msg_oob, inline_ex_oob_drop)
 		epollpair(false);
 		siocatmarkpair(false);
 	}
+
+	resetpair(false);
 }
 
 TEST_F(msg_oob, inline_ex_oob_siocatmark)
@@ -775,6 +884,8 @@ TEST_F(msg_oob, inline_ex_oob_siocatmark)
 	recvpair("hell", 4, 4, 0);		/* Intentionally stop at ex-OOB. */
 	epollpair(true);
 	siocatmarkpair(false);
+
+	resetpair(true);
 }
 
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From fa6f092cc0a02d0fcee37e9e8172eda372a03d33 Mon Sep 17 00:00:00 2001
From: Adin Scannell <amscanne@meta.com>
Date: Tue, 24 Jun 2025 22:02:15 -0700
Subject: libbpf: Fix possible use-after-free for externs

The `name` field in `obj->externs` points into the BTF data at initial
open time. However, some functions may invalidate this after opening and
before loading (e.g. `bpf_map__set_value_size`), which results in
pointers into freed memory and undefined behavior.

The simplest solution is to simply `strdup` these strings, similar to
the `essent_name`, and free them at the same time.

In order to test this path, the `global_map_resize` BPF selftest is
modified slightly to ensure the presence of an extern, which causes this
test to fail prior to the fix. Given there isn't an obvious API or error
to test against, I opted to add this to the existing test as an aspect
of the resizing feature rather than duplicate the test.

Fixes: 9d0a23313b1a ("libbpf: Add capability for resizing datasec maps")
Signed-off-by: Adin Scannell <amscanne@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250625050215.2777374-1-amscanne@meta.com
---
 .../testing/selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c
index a3f220ba7025..ee65bad0436d 100644
--- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c
+++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c
@@ -32,6 +32,16 @@ int my_int_last SEC(".data.array_not_last");
 
 int percpu_arr[1] SEC(".data.percpu_arr");
 
+/* at least one extern is included, to ensure that a specific
+ * regression is tested whereby resizing resulted in a free-after-use
+ * bug after type information is invalidated by the resize operation.
+ *
+ * There isn't a particularly good API to test for this specific condition,
+ * but by having externs for the resizing tests it will cover this path.
+ */
+extern int LINUX_KERNEL_VERSION __kconfig;
+long version_sink;
+
 SEC("tp/syscalls/sys_enter_getpid")
 int bss_array_sum(void *ctx)
 {
@@ -44,6 +54,9 @@ int bss_array_sum(void *ctx)
 	for (size_t i = 0; i < bss_array_len; ++i)
 		sum += array[i];
 
+	/* see above; ensure this is not optimized out */
+	version_sink = LINUX_KERNEL_VERSION;
+
 	return 0;
 }
 
@@ -59,6 +72,9 @@ int data_array_sum(void *ctx)
 	for (size_t i = 0; i < data_array_len; ++i)
 		sum += my_array[i];
 
+	/* see above; ensure this is not optimized out */
+	version_sink = LINUX_KERNEL_VERSION;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5e9388f7984a9cc7e659a105113f6ccf0aebedd0 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 25 Jun 2025 17:03:55 -0400
Subject: selftests/bpf: adapt one more case in test_lru_map to the new
 target_free

The below commit that updated BPF_MAP_TYPE_LRU_HASH free target,
also updated tools/testing/selftests/bpf/test_lru_map to match.

But that missed one case that passes with 4 cores, but fails at
higher cpu counts.

Update test_lru_sanity3 to also adjust its expectation of target_free.

This time tested with 1, 4, 16, 64 and 384 cpu count.

Fixes: d4adf1c9ee77 ("bpf: Adjust free target to avoid global starvation of LRU map")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20250625210412.2732970-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_lru_map.c | 33 ++++++++++++++++--------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 4ae83f4b7fc7..0921939532c6 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try)
 	return ret;
 }
 
+/* Derive target_free from map_size, same as bpf_common_lru_populate */
+static unsigned int __tgt_size(unsigned int map_size)
+{
+	return (map_size / nr_cpus) / 2;
+}
+
 /* Inverse of how bpf_common_lru_populate derives target_free from map_size. */
 static unsigned int __map_size(unsigned int tgt_free)
 {
@@ -410,12 +416,12 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	printf("Pass\n");
 }
 
-/* Size of the LRU map is 2*tgt_free
- * It is to test the active/inactive list rotation
- * Insert 1 to 2*tgt_free (+2*tgt_free keys)
- * Lookup key 1 to tgt_free*3/2
- * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
- *  => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
+/* Test the active/inactive list rotation
+ *
+ * Fill the whole map, deplete the free list.
+ * Reference all except the last lru->target_free elements.
+ * Insert lru->target_free new elements. This triggers one shrink.
+ * Verify that the non-referenced elements are replaced.
  */
 static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 {
@@ -434,8 +440,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 
 	assert(sched_next_online(0, &next_cpu) != -1);
 
-	batch_size = tgt_free / 2;
-	assert(batch_size * 2 == tgt_free);
+	batch_size = __tgt_size(tgt_free);
 
 	map_size = tgt_free * 2;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
@@ -446,23 +451,21 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 
 	value[0] = 1234;
 
-	/* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
-	end_key = 1 + (2 * tgt_free);
+	/* Fill the map */
+	end_key = 1 + map_size;
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
-	/* Lookup key 1 to tgt_free*3/2 */
-	end_key = tgt_free + batch_size;
+	/* Reference all but the last batch_size */
+	end_key = 1 + map_size - batch_size;
 	for (key = 1; key < end_key; key++) {
 		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
 					    BPF_NOEXIST));
 	}
 
-	/* Add 1+2*tgt_free to tgt_free*5/2
-	 * (+tgt_free/2 keys)
-	 */
+	/* Insert new batch_size: replaces the non-referenced elements */
 	key = 2 * tgt_free + 1;
 	end_key = key + batch_size;
 	for (; key < end_key; key++) {
-- 
cgit v1.2.3


From 07caaf875c937e5f0a262a1dbad307d08ecc8673 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 24 Jun 2025 14:09:55 -0700
Subject: netlink: specs: ethtool: replace underscores with dashes in names

We're trying to add a strict regexp for the name format in the spec.
Underscores will not be allowed, dashes should be used instead.
This makes no difference to C (codegen replaces special chars in names)
but gives more uniform naming in Python.

Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash")
Fixes: 46fb3ba95b93 ("ethtool: Add an interface for flashing transceiver modules' firmware")
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20250624211002.3475021-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/testing')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
index f439c434ba36..648ff50bc1c3 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
@@ -38,7 +38,7 @@ def test_rss_input_xfrm(cfg, ipver):
         raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11")
 
     input_xfrm = cfg.ethnl.rss_get(
-        {'header': {'dev-name': cfg.ifname}}).get('input_xfrm')
+        {'header': {'dev-name': cfg.ifname}}).get('input-xfrm')
 
     # Check for symmetric xor/or-xor
     if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2):
-- 
cgit v1.2.3