From 2051da858534a73589cdb27af914fe1c03b9ee98 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:20:50 -0800 Subject: arm64/crc-t10dif: expose CRC-T10DIF function through lib Move the arm64 CRC-T10DIF assembly code into the lib directory and wire it up to the library interface. This allows it to be used without going through the crypto API. It remains usable via the crypto API too via the shash algorithms that use the library interface. Thus all the arch-specific "shash" code becomes unnecessary and is removed. Note: to see the diff from arch/arm64/crypto/crct10dif-ce-glue.c to arch/arm64/lib/crc-t10dif-glue.c, view this commit with 'git show -M10'. Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241202012056.209768-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- tools/testing/selftests/arm64/fp/kernel-test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index 859345379044..348e8bef62c7 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -46,8 +46,7 @@ static void handle_kick_signal(int sig, siginfo_t *info, void *context) } static char *drivers[] = { - "crct10dif-arm64-ce", - /* "crct10dif-arm64-neon", - Same priority as generic */ + "crct10dif-arm64", "sha1-ce", "sha224-arm64", "sha224-arm64-neon", -- cgit v1.2.3 From 615ab43b838bb982dc234feff75ee9ad35447c5d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 22 Nov 2024 14:24:59 +0100 Subject: tests/pid_namespace: add pid_max tests Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20241122132459.135120-3-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- tools/testing/selftests/pid_namespace/.gitignore | 1 + tools/testing/selftests/pid_namespace/Makefile | 2 +- tools/testing/selftests/pid_namespace/pid_max.c | 358 +++++++++++++++++++++++ 3 files changed, 360 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pid_namespace/pid_max.c (limited to 'tools') diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore index 93ab9d7e5b7e..5118f0f3edf4 100644 --- a/tools/testing/selftests/pid_namespace/.gitignore +++ b/tools/testing/selftests/pid_namespace/.gitignore @@ -1 +1,2 @@ +pid_max regression_enomem diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile index 9286a1d22cd3..b972f55d07ae 100644 --- a/tools/testing/selftests/pid_namespace/Makefile +++ b/tools/testing/selftests/pid_namespace/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -g $(KHDR_INCLUDES) -TEST_GEN_PROGS = regression_enomem +TEST_GEN_PROGS = regression_enomem pid_max LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h diff --git a/tools/testing/selftests/pid_namespace/pid_max.c b/tools/testing/selftests/pid_namespace/pid_max.c new file mode 100644 index 000000000000..51c414faabb0 --- /dev/null +++ b/tools/testing/selftests/pid_namespace/pid_max.c @@ -0,0 +1,358 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +#define __STACK_SIZE (8 * 1024 * 1024) +static pid_t do_clone(int (*fn)(void *), void *arg, int flags) +{ + char *stack; + 
pid_t ret; + + stack = malloc(__STACK_SIZE); + if (!stack) + return -ENOMEM; + +#ifdef __ia64__ + ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg); +#else + ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg); +#endif + free(stack); + return ret; +} + +static int pid_max_cb(void *data) +{ + int fd, ret; + pid_t pid; + + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); + if (ret) { + fprintf(stderr, "%m - Failed to make rootfs private mount\n"); + return -1; + } + + umount2("/proc", MNT_DETACH); + + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret) { + fprintf(stderr, "%m - Failed to mount proc\n"); + return -1; + } + + fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + fprintf(stderr, "%m - Failed to open pid_max\n"); + return -1; + } + + ret = write(fd, "500", sizeof("500") - 1); + if (ret < 0) { + fprintf(stderr, "%m - Failed to write pid_max\n"); + return -1; + } + + for (int i = 0; i < 501; i++) { + pid = fork(); + if (pid == 0) + exit(EXIT_SUCCESS); + wait_for_pid(pid); + if (pid > 500) { + fprintf(stderr, "Managed to create pid number beyond limit\n"); + return -1; + } + } + + return 0; +} + +static int pid_max_nested_inner(void *data) +{ + int fret = -1; + pid_t pids[2]; + int fd, i, ret; + + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); + if (ret) { + fprintf(stderr, "%m - Failed to make rootfs private mount\n"); + return fret; + } + + umount2("/proc", MNT_DETACH); + + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret) { + fprintf(stderr, "%m - Failed to mount proc\n"); + return fret; + } + + fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + fprintf(stderr, "%m - Failed to open pid_max\n"); + return fret; + } + + ret = write(fd, "500", sizeof("500") - 1); + close(fd); + if (ret < 0) { + fprintf(stderr, "%m - Failed to write pid_max\n"); + return fret; + } + + pids[0] = fork(); + if (pids[0] < 0) { + fprintf(stderr, "Failed to create first new process\n"); + return fret; + } + + if (pids[0] == 0) + exit(EXIT_SUCCESS); + + pids[1] = fork(); + wait_for_pid(pids[0]); + if (pids[1] >= 0) { + if (pids[1] == 0) + exit(EXIT_SUCCESS); + wait_for_pid(pids[1]); + + fprintf(stderr, "Managed to create process even though ancestor pid namespace had a limit\n"); + return fret; + } + + /* Now make sure that we wrap pids at 400. */ + for (i = 0; i < 510; i++) { + pid_t pid; + + pid = fork(); + if (pid < 0) + return fret; + + if (pid == 0) + exit(EXIT_SUCCESS); + + wait_for_pid(pid); + if (pid >= 500) { + fprintf(stderr, "Managed to create process with pid %d beyond configured limit\n", pid); + return fret; + } + } + + return 0; +} + +static int pid_max_nested_outer(void *data) +{ + int fret = -1, nr_procs = 400; + pid_t pids[1000]; + int fd, i, ret; + pid_t pid; + + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); + if (ret) { + fprintf(stderr, "%m - Failed to make rootfs private mount\n"); + return fret; + } + + umount2("/proc", MNT_DETACH); + + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret) { + fprintf(stderr, "%m - Failed to mount proc\n"); + return fret; + } + + fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + fprintf(stderr, "%m - Failed to open pid_max\n"); + return fret; + } + + ret = write(fd, "400", sizeof("400") - 1); + close(fd); + if (ret < 0) { + fprintf(stderr, "%m - Failed to write pid_max\n"); + return fret; + } + + /* + * Create 397 processes. This leaves room for do_clone() (398) and + * one more 399. 
So creating another process needs to fail. + */ + for (nr_procs = 0; nr_procs < 396; nr_procs++) { + pid = fork(); + if (pid < 0) + goto reap; + + if (pid == 0) + exit(EXIT_SUCCESS); + + pids[nr_procs] = pid; + } + + pid = do_clone(pid_max_nested_inner, NULL, CLONE_NEWPID | CLONE_NEWNS); + if (pid < 0) { + fprintf(stderr, "%m - Failed to clone nested pidns\n"); + goto reap; + } + + if (wait_for_pid(pid)) { + fprintf(stderr, "%m - Nested pid_max failed\n"); + goto reap; + } + + fret = 0; + +reap: + for (int i = 0; i < nr_procs; i++) + wait_for_pid(pids[i]); + + return fret; +} + +static int pid_max_nested_limit_inner(void *data) +{ + int fret = -1, nr_procs = 400; + int fd, ret; + pid_t pid; + pid_t pids[1000]; + + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); + if (ret) { + fprintf(stderr, "%m - Failed to make rootfs private mount\n"); + return fret; + } + + umount2("/proc", MNT_DETACH); + + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret) { + fprintf(stderr, "%m - Failed to mount proc\n"); + return fret; + } + + fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + fprintf(stderr, "%m - Failed to open pid_max\n"); + return fret; + } + + ret = write(fd, "500", sizeof("500") - 1); + close(fd); + if (ret < 0) { + fprintf(stderr, "%m - Failed to write pid_max\n"); + return fret; + } + + for (nr_procs = 0; nr_procs < 500; nr_procs++) { + pid = fork(); + if (pid < 0) + break; + + if (pid == 0) + exit(EXIT_SUCCESS); + + pids[nr_procs] = pid; + } + + if (nr_procs >= 400) { + fprintf(stderr, "Managed to create processes beyond the configured outer limit\n"); + goto reap; + } + + fret = 0; + +reap: + for (int i = 0; i < nr_procs; i++) + wait_for_pid(pids[i]); + + return fret; +} + +static int pid_max_nested_limit_outer(void *data) +{ + int fd, ret; + pid_t pid; + + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); + if (ret) { + fprintf(stderr, "%m - Failed to make rootfs private mount\n"); + return -1; + } + + umount2("/proc", MNT_DETACH); + + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret) { + fprintf(stderr, "%m - Failed to mount proc\n"); + return -1; + } + + fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); + if (fd < 0) { + fprintf(stderr, "%m - Failed to open pid_max\n"); + return -1; + } + + ret = write(fd, "400", sizeof("400") - 1); + close(fd); + if (ret < 0) { + fprintf(stderr, "%m - Failed to write pid_max\n"); + return -1; + } + + pid = do_clone(pid_max_nested_limit_inner, NULL, CLONE_NEWPID | CLONE_NEWNS); + if (pid < 0) { + fprintf(stderr, "%m - Failed to clone nested pidns\n"); + return -1; + } + + if (wait_for_pid(pid)) { + fprintf(stderr, "%m - Nested pid_max failed\n"); + return -1; + } + + return 0; +} + +TEST(pid_max_simple) +{ + pid_t pid; + + + pid = do_clone(pid_max_cb, NULL, CLONE_NEWPID | CLONE_NEWNS); + ASSERT_GT(pid, 0); + ASSERT_EQ(0, wait_for_pid(pid)); +} + +TEST(pid_max_nested_limit) +{ + pid_t pid; + + pid = do_clone(pid_max_nested_limit_outer, NULL, CLONE_NEWPID | CLONE_NEWNS); + ASSERT_GT(pid, 0); + ASSERT_EQ(0, wait_for_pid(pid)); +} + +TEST(pid_max_nested) +{ + pid_t pid; + + pid = do_clone(pid_max_nested_outer, NULL, CLONE_NEWPID | CLONE_NEWNS); + ASSERT_GT(pid, 0); + ASSERT_EQ(0, wait_for_pid(pid)); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From eb449bd96954b1c1e491d19066cfd2a010f0aa47 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all 
mmap_write_lock variants to increment it, in-line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also change to be unsigned to match the type of mm_lock_seq.sequence. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com --- tools/testing/vma/vma.c | 4 ++-- tools/testing/vma/vma_internal.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 8fab5e13c7c3..9bcf1736bf18 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, * begun. Linking to the tree will have caused this to be incremented, * which means we will get a false positive otherwise. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index e76ff579e1fd..1d9fc97b8e80 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -241,7 +241,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; struct vma_lock *vm_lock; #endif @@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } -- cgit v1.2.3 From 2116b349e29a2e9ba17ea2e45b31234e4b350793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:52 +0100 Subject: objtool: Generic annotation infrastructure Avoid endless .discard.foo sections for each annotation, create a single .discard.annotate_insn section that takes an annotation type along with the instruction. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094310.932794537@infradead.org --- tools/objtool/check.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411f..b0efc8ee16d6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2373,6 +2373,49 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } +static int read_annotate(struct objtool_file *file, void (*func)(int type, struct instruction *insn)) +{ + struct section *sec; + struct instruction *insn; + struct reloc *reloc; + int type; + + sec = find_section_by_name(file->elf, ".discard.annotate_insn"); + if (!sec) + return 0; + + if (!sec->rsec) + return 0; + + if (sec->sh.sh_entsize != 8) { + static bool warned = false; + if (!warned) { + WARN("%s: dodgy linker, sh_entsize != 8", sec->name); + warned = true; + } + sec->sh.sh_entsize = 8; + } + + for_each_reloc(sec->rsec, reloc) { + type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); + + insn = find_insn(file, reloc->sym->sec, + reloc->sym->offset + reloc_addend(reloc)); + if (!insn) { + WARN("bad .discard.annotate_insn entry: %d of type %d", reloc_idx(reloc), type); + return -1; + } + + func(type, insn); + } + + return 0; +} + +static void __annotate_nop(int type, struct instruction *insn) +{ +} + static int read_noendbr_hints(struct objtool_file *file) { struct instruction *insn; @@ -2670,6 +2713,8 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + read_annotate(file, __annotate_nop); + /* * Must be before read_unwind_hints() since that needs insn->noendbr. */ -- cgit v1.2.3 From 22c3d58079688b697f36d670616e463cbb14d058 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:53 +0100 Subject: objtool: Convert ANNOTATE_NOENDBR to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.042140333@infradead.org --- tools/include/linux/objtool_types.h | 5 +++++ tools/objtool/check.c | 32 +++++--------------------------- 2 files changed, 10 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 453a4f4ef39d..4884f8cf8429 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -54,4 +54,9 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_SAVE 6 #define UNWIND_HINT_TYPE_RESTORE 7 +/* + * Annotate types + */ +#define ANNOTYPE_NOENDBR 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b0efc8ee16d6..a74ff26860f7 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2412,32 +2412,12 @@ static int read_annotate(struct objtool_file *file, void (*func)(int type, struc return 0; } -static void __annotate_nop(int type, struct instruction *insn) +static void __annotate_noendbr(int type, struct instruction *insn) { -} - -static int read_noendbr_hints(struct objtool_file *file) -{ - struct instruction *insn; - struct section *rsec; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.noendbr"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - insn = find_insn(file, reloc->sym->sec, - reloc->sym->offset + reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.noendbr entry"); - return -1; - } - - insn->noendbr = 1; - } + if (type 
!= ANNOTYPE_NOENDBR) + return; - return 0; + insn->noendbr = 1; } static int read_retpoline_hints(struct objtool_file *file) @@ -2713,12 +2693,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - read_annotate(file, __annotate_nop); - /* * Must be before read_unwind_hints() since that needs insn->noendbr. */ - ret = read_noendbr_hints(file); + ret = read_annotate(file, __annotate_noendbr); if (ret) return ret; -- cgit v1.2.3 From bf5febebd99fddfc6226a94e937d38a8d470b24e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:54 +0100 Subject: objtool: Convert ANNOTATE_RETPOLINE_SAFE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.145275669@infradead.org --- tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 52 +++++++++++++------------------------ 2 files changed, 19 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 4884f8cf8429..1b348361ad1d 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -58,5 +58,6 @@ struct unwind_hint { * Annotate types */ #define ANNOTYPE_NOENDBR 1 +#define ANNOTYPE_RETPOLINE_SAFE 2 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a74ff26860f7..c5b52309b80d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2373,12 +2373,12 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } -static int read_annotate(struct objtool_file *file, void (*func)(int type, struct instruction *insn)) +static int read_annotate(struct objtool_file *file, int (*func)(int type, struct instruction *insn)) { struct section *sec; struct instruction *insn; struct reloc *reloc; - int type; + int type, ret; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); if (!sec) @@ -2406,53 +2406,37 @@ static int read_annotate(struct objtool_file *file, void (*func)(int type, struc return -1; } - func(type, insn); + ret = func(type, insn); + if (ret < 0) + return ret; } return 0; } -static void __annotate_noendbr(int type, struct instruction *insn) +static int __annotate_noendbr(int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) - return; + return 0; insn->noendbr = 1; + return 0; } -static int read_retpoline_hints(struct objtool_file *file) +static int __annotate_retpoline_safe(int type, struct instruction *insn) { - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe"); - if (!rsec) + if (type != ANNOTYPE_RETPOLINE_SAFE) return 0; - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.retpoline_safe entry"); - return -1; - } - - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN && - insn->type != INSN_NOP) { - WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); - return -1; - } - - insn->retpoline_safe = true; + if (insn->type != INSN_JUMP_DYNAMIC && + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); + return -1; } + insn->retpoline_safe = 
true; return 0; } @@ -2742,7 +2726,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_retpoline_hints(file); + ret = read_annotate(file, __annotate_retpoline_safe); if (ret) return ret; -- cgit v1.2.3 From 317f2a64618c528539d17fe6957a64106087fbd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:55 +0100 Subject: objtool: Convert instrumentation_{begin,end}() to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.245980207@infradead.org --- tools/include/linux/objtool_types.h | 2 ++ tools/objtool/check.c | 49 ++++++++----------------------------- 2 files changed, 12 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 1b348361ad1d..d4d68dd36f7a 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -59,5 +59,7 @@ struct unwind_hint { */ #define ANNOTYPE_NOENDBR 1 #define ANNOTYPE_RETPOLINE_SAFE 2 +#define ANNOTYPE_INSTR_BEGIN 3 +#define ANNOTYPE_INSTR_END 4 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c5b52309b80d..8e39c7f484d8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2440,48 +2440,19 @@ static int __annotate_retpoline_safe(int type, struct instruction *insn) return 0; } -static int read_instr_hints(struct objtool_file *file) +static int __annotate_instr(int type, struct instruction *insn) { - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.instr_end"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_end entry"); - return -1; - } + switch (type) { + case ANNOTYPE_INSTR_BEGIN: + insn->instr++; + break; + case ANNOTYPE_INSTR_END: insn->instr--; - } - - rsec = find_section_by_name(file->elf, ".rela.discard.instr_begin"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_begin entry"); - return -1; - } + break; - insn->instr++; + default: + break; } return 0; @@ -2730,7 +2701,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_instr_hints(file); + ret = read_annotate(file, __annotate_instr); if (ret) return ret; -- cgit v1.2.3 From 18aa6118a1689b4d73c5ebbd917ae3f20c9c0db1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:56 +0100 Subject: objtool: Convert VALIDATE_UNRET_BEGIN to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.358508242@infradead.org --- tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 28 +++++----------------------- 2 files changed, 6 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index d4d68dd36f7a..16236a56364b 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -61,5 +61,6 @@ struct unwind_hint { 
#define ANNOTYPE_RETPOLINE_SAFE 2 #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 +#define ANNOTYPE_UNRET_BEGIN 5 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8e39c7f484d8..2a703748cad1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2458,33 +2458,15 @@ static int __annotate_instr(int type, struct instruction *insn) return 0; } -static int read_validate_unret_hints(struct objtool_file *file) +static int __annotate_unret(int type, struct instruction *insn) { - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.validate_unret"); - if (!rsec) + if (type != ANNOTYPE_UNRET_BEGIN) return 0; - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_end entry"); - return -1; - } - insn->unret = 1; - } - + insn->unret = 1; return 0; -} +} static int read_intra_function_calls(struct objtool_file *file) { @@ -2705,7 +2687,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_validate_unret_hints(file); + ret = read_annotate(file, __annotate_unret); if (ret) return ret; -- cgit v1.2.3 From f0cd57c35a75f152d3b31b9be3f7f413b96a6d3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:57 +0100 Subject: objtool: Convert ANNOTATE_IGNORE_ALTERNATIVE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.465691316@infradead.org --- tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 45 +++++++++---------------------------- 2 files changed, 11 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 16236a56364b..eab15dbe1cb7 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -62,5 +62,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 +#define ANNOTYPE_IGNORE_ALTS 6 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2a703748cad1..ba2cb9b69399 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1309,40 +1309,6 @@ static void add_uaccess_safe(struct objtool_file *file) } } -/* - * FIXME: For now, just ignore any alternatives which add retpolines. This is - * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline. - * But it at least allows objtool to understand the control flow *around* the - * retpoline. - */ -static int add_ignore_alternatives(struct objtool_file *file) -{ - struct section *rsec; - struct reloc *reloc; - struct instruction *insn; - - rsec = find_section_by_name(file->elf, ".rela.discard.ignore_alts"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.ignore_alts entry"); - return -1; - } - - insn->ignore_alts = true; - } - - return 0; -} - /* * Symbols that replace INSN_CALL_DYNAMIC, every (tail) call to such a symbol * will be added to the .retpoline_sites section. 
@@ -2414,6 +2380,15 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return 0; } +static int __annotate_ignore_alts(int type, struct instruction *insn) +{ + if (type != ANNOTYPE_IGNORE_ALTS) + return 0; + + insn->ignore_alts = true; + return 0; +} + static int __annotate_noendbr(int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) @@ -2626,7 +2601,7 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); add_uaccess_safe(file); - ret = add_ignore_alternatives(file); + ret = read_annotate(file, __annotate_ignore_alts); if (ret) return ret; -- cgit v1.2.3 From 112765ca1cb9353e71b4f5af4e6e6c4a69c28d99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:58 +0100 Subject: objtool: Convert ANNOTATE_INTRA_FUNCTION_CALL to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.584892071@infradead.org --- tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 96 +++++++++++++++---------------------- 2 files changed, 40 insertions(+), 57 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index eab15dbe1cb7..23d6fb6d04c7 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -63,5 +63,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 +#define ANNOTYPE_INTRA_FUNCTION_CALL 7 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ba2cb9b69399..2222fe710832 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2339,7 +2339,8 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } -static int read_annotate(struct objtool_file *file, int (*func)(int type, struct instruction *insn)) +static int read_annotate(struct objtool_file *file, + int (*func)(struct objtool_file *file, int type, struct instruction *insn)) { struct section *sec; struct instruction *insn; @@ -2372,7 +2373,7 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return -1; } - ret = func(type, insn); + ret = func(file, type, insn); if (ret < 0) return ret; } @@ -2380,7 +2381,7 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return 0; } -static int __annotate_ignore_alts(int type, struct instruction *insn) +static int __annotate_ignore_alts(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_IGNORE_ALTS) return 0; @@ -2389,7 +2390,7 @@ static int __annotate_ignore_alts(int type, struct instruction *insn) return 0; } -static int __annotate_noendbr(int type, struct instruction *insn) +static int __annotate_noendbr(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) return 0; @@ -2398,7 +2399,37 @@ static int __annotate_noendbr(int type, struct instruction *insn) return 0; } -static int __annotate_retpoline_safe(int type, struct instruction *insn) +static int __annotate_ifc(struct objtool_file *file, int type, struct instruction *insn) +{ + unsigned long dest_off; + + if (type != ANNOTYPE_INTRA_FUNCTION_CALL) + return 0; + + if (insn->type != INSN_CALL) { + WARN_INSN(insn, "intra_function_call not a direct call"); + return -1; + } + + /* + * Treat intra-function CALLs as JMPs, but with a stack_op. + * See add_call_destinations(), which strips stack_ops from + * normal CALLs. 
+ */ + insn->type = INSN_JUMP_UNCONDITIONAL; + + dest_off = arch_jump_destination(insn); + insn->jump_dest = find_insn(file, insn->sec, dest_off); + if (!insn->jump_dest) { + WARN_INSN(insn, "can't find call dest at %s+0x%lx", + insn->sec->name, dest_off); + return -1; + } + + return 0; +} + +static int __annotate_retpoline_safe(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_RETPOLINE_SAFE) return 0; @@ -2415,7 +2446,7 @@ static int __annotate_retpoline_safe(int type, struct instruction *insn) return 0; } -static int __annotate_instr(int type, struct instruction *insn) +static int __annotate_instr(struct objtool_file *file, int type, struct instruction *insn) { switch (type) { case ANNOTYPE_INSTR_BEGIN: @@ -2433,7 +2464,7 @@ static int __annotate_instr(int type, struct instruction *insn) return 0; } -static int __annotate_unret(int type, struct instruction *insn) +static int __annotate_unret(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_UNRET_BEGIN) return 0; @@ -2443,55 +2474,6 @@ static int __annotate_unret(int type, struct instruction *insn) } -static int read_intra_function_calls(struct objtool_file *file) -{ - struct instruction *insn; - struct section *rsec; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - unsigned long dest_off; - - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", - rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.intra_function_call entry"); - return -1; - } - - if (insn->type != INSN_CALL) { - WARN_INSN(insn, "intra_function_call not a direct call"); - return -1; - } - - /* - * Treat intra-function CALLs as JMPs, but with a stack_op. - * See add_call_destinations(), which strips stack_ops from - * normal CALLs. - */ - insn->type = INSN_JUMP_UNCONDITIONAL; - - dest_off = arch_jump_destination(insn); - insn->jump_dest = find_insn(file, insn->sec, dest_off); - if (!insn->jump_dest) { - WARN_INSN(insn, "can't find call dest at %s+0x%lx", - insn->sec->name, dest_off); - return -1; - } - } - - return 0; -} - /* * Return true if name matches an instrumentation function, where calls to that * function from noinstr code can safely be removed, but compilers won't do so. @@ -2630,7 +2612,7 @@ static int decode_sections(struct objtool_file *file) * Must be before add_call_destination(); it changes INSN_CALL to * INSN_JUMP. */ - ret = read_intra_function_calls(file); + ret = read_annotate(file, __annotate_ifc); if (ret) return ret; -- cgit v1.2.3 From a8a330dd9900024dc18b048c4f0f3c6ad22ff4c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:59 +0100 Subject: objtool: Collapse annotate sequences Reduce read_annotate() runs by collapsing subsequent runs into a single call. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.688871544@infradead.org --- tools/objtool/check.c | 87 +++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 55 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2222fe710832..3bea8b2963d3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2381,21 +2381,24 @@ static int read_annotate(struct objtool_file *file, return 0; } -static int __annotate_ignore_alts(struct objtool_file *file, int type, struct instruction *insn) +static int __annotate_early(struct objtool_file *file, int type, struct instruction *insn) { - if (type != ANNOTYPE_IGNORE_ALTS) - return 0; + switch (type) { + case ANNOTYPE_IGNORE_ALTS: + insn->ignore_alts = true; + break; - insn->ignore_alts = true; - return 0; -} + /* + * Must be before read_unwind_hints() since that needs insn->noendbr. + */ + case ANNOTYPE_NOENDBR: + insn->noendbr = 1; + break; -static int __annotate_noendbr(struct objtool_file *file, int type, struct instruction *insn) -{ - if (type != ANNOTYPE_NOENDBR) - return 0; + default: + break; + } - insn->noendbr = 1; return 0; } @@ -2429,26 +2432,21 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio return 0; } -static int __annotate_retpoline_safe(struct objtool_file *file, int type, struct instruction *insn) +static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { - if (type != ANNOTYPE_RETPOLINE_SAFE) - return 0; - - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN && - insn->type != INSN_NOP) { - WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); - return -1; - } + switch (type) { + case ANNOTYPE_RETPOLINE_SAFE: + if (insn->type != INSN_JUMP_DYNAMIC && + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); + return -1; + } - insn->retpoline_safe = true; - return 0; -} + insn->retpoline_safe = true; + break; -static int __annotate_instr(struct objtool_file *file, int type, struct instruction *insn) -{ - switch (type) { case ANNOTYPE_INSTR_BEGIN: insn->instr++; break; @@ -2457,6 +2455,10 @@ static int __annotate_instr(struct objtool_file *file, int type, struct instruct insn->instr--; break; + case ANNOTYPE_UNRET_BEGIN: + insn->unret = 1; + break; + default: break; } @@ -2464,16 +2466,6 @@ static int __annotate_instr(struct objtool_file *file, int type, struct instruct return 0; } -static int __annotate_unret(struct objtool_file *file, int type, struct instruction *insn) -{ - if (type != ANNOTYPE_UNRET_BEGIN) - return 0; - - insn->unret = 1; - return 0; - -} - /* * Return true if name matches an instrumentation function, where calls to that * function from noinstr code can safely be removed, but compilers won't do so. @@ -2583,14 +2575,7 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); add_uaccess_safe(file); - ret = read_annotate(file, __annotate_ignore_alts); - if (ret) - return ret; - - /* - * Must be before read_unwind_hints() since that needs insn->noendbr. 
- */ - ret = read_annotate(file, __annotate_noendbr); + ret = read_annotate(file, __annotate_early); if (ret) return ret; @@ -2636,15 +2621,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_annotate(file, __annotate_retpoline_safe); - if (ret) - return ret; - - ret = read_annotate(file, __annotate_instr); - if (ret) - return ret; - - ret = read_annotate(file, __annotate_unret); + ret = read_annotate(file, __annotate_late); if (ret) return ret; -- cgit v1.2.3 From 06e24745985c8dd0da18337503afcf2f2fdbdff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:04 +0100 Subject: objtool: Remove annotate_{,un}reachable() There are no users of annotate_reachable() left. And the annotate_unreachable() usage in unreachable() is plain wrong; it will hide dangerous fall-through code-gen. Remove both. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.235637588@infradead.org --- tools/objtool/check.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3bea8b2963d3..798cff5bffc4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -638,47 +638,8 @@ static int add_dead_ends(struct objtool_file *file) uint64_t offset; /* - * Check for manually annotated dead ends. - */ - rsec = find_section_by_name(file->elf, ".rela.discard.unreachable"); - if (!rsec) - goto reachable; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, offset); - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) { - insn = find_last_insn(file, reloc->sym->sec); - if (!insn) { - WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - } else { - WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - - insn->dead_end = true; - } - -reachable: - /* - * These manually annotated reachable checks are needed for GCC 4.4, - * where the Linux unreachable() macro isn't supported. In that case - * GCC doesn't know the "ud2" is fatal, so it generates code as if it's - * not a dead end. + * UD2 defaults to being a dead-end, allow them to be annotated for + * non-fatal, eg WARN. 
*/ rsec = find_section_by_name(file->elf, ".rela.discard.reachable"); if (!rsec) -- cgit v1.2.3 From e7a174fb43d24adca066e82d1cb9fdee092d48d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:05 +0100 Subject: objtool: Convert {.UN}REACHABLE to ANNOTATE Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.353431347@infradead.org --- tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 82 ++++++++++--------------------------- 2 files changed, 23 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 23d6fb6d04c7..df5d9fa84dba 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 +#define ANNOTYPE_REACHABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 798cff5bffc4..27d0c4153582 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -627,56 +627,6 @@ static struct instruction *find_last_insn(struct objtool_file *file, return insn; } -/* - * Mark "ud2" instructions and manually annotated dead ends. - */ -static int add_dead_ends(struct objtool_file *file) -{ - struct section *rsec; - struct reloc *reloc; - struct instruction *insn; - uint64_t offset; - - /* - * UD2 defaults to being a dead-end, allow them to be annotated for - * non-fatal, eg WARN. - */ - rsec = find_section_by_name(file->elf, ".rela.discard.reachable"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, offset); - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) { - insn = find_last_insn(file, reloc->sym->sec); - if (!insn) { - WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - } else { - WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - - insn->dead_end = false; - } - - return 0; -} - static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -2306,6 +2256,7 @@ static int read_annotate(struct objtool_file *file, struct section *sec; struct instruction *insn; struct reloc *reloc; + uint64_t offset; int type, ret; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); @@ -2327,8 +2278,19 @@ static int read_annotate(struct objtool_file *file, for_each_reloc(sec->rsec, reloc) { type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); - insn = find_insn(file, reloc->sym->sec, - reloc->sym->offset + reloc_addend(reloc)); + offset = reloc->sym->offset + reloc_addend(reloc); + insn = find_insn(file, reloc->sym->sec, offset); + + /* + * Reachable annotations are 'funneh' and act on the previous instruction :/ + */ + if (type == ANNOTYPE_REACHABLE) { + if (insn) + insn = prev_insn_same_sec(file, insn); + else if (offset == reloc->sym->sec->sh.sh_size) + insn = find_last_insn(file, reloc->sym->sec); + } + if (!insn) { WARN("bad .discard.annotate_insn entry: %d of type 
%d", reloc_idx(reloc), type); return -1; @@ -2420,6 +2382,10 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->unret = 1; break; + case ANNOTYPE_REACHABLE: + insn->dead_end = false; + break; + default: break; } @@ -2566,14 +2532,6 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be after add_call_destinations() such that it can override - * dead_end_function() marks. - */ - ret = add_dead_ends(file); - if (ret) - return ret; - ret = add_jump_table_alts(file); if (ret) return ret; @@ -2582,6 +2540,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be after add_call_destinations() such that it can override + * dead_end_function() marks. + */ ret = read_annotate(file, __annotate_late); if (ret) return ret; -- cgit v1.2.3 From 87116ae6da034242baf06e799f9f0e2a8ee6a796 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:06 +0100 Subject: objtool: Fix ANNOTATE_REACHABLE to be a normal annotation Currently REACHABLE is weird for being on the instruction after the instruction it modifies. Since all REACHABLE annotations have an explicit instruction, flip them around. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.494176035@infradead.org --- tools/objtool/check.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 27d0c4153582..26bdd3ebf5d2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -614,19 +614,6 @@ static int init_pv_ops(struct objtool_file *file) return 0; } -static struct instruction *find_last_insn(struct objtool_file *file, - struct section *sec) -{ - struct instruction *insn = NULL; - unsigned int offset; - unsigned int end = (sec->sh.sh_size > 10) ? 
sec->sh.sh_size - 10 : 0; - - for (offset = sec->sh.sh_size - 1; offset >= end && !insn; offset--) - insn = find_insn(file, sec, offset); - - return insn; -} - static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -2281,16 +2268,6 @@ static int read_annotate(struct objtool_file *file, offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); - /* - * Reachable annotations are 'funneh' and act on the previous instruction :/ - */ - if (type == ANNOTYPE_REACHABLE) { - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) - insn = find_last_insn(file, reloc->sym->sec); - } - if (!insn) { WARN("bad .discard.annotate_insn entry: %d of type %d", reloc_idx(reloc), type); return -1; -- cgit v1.2.3 From e7e0eb53c2f0f68fe2577472ce2802a4efd9d7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:07 +0100 Subject: objtool: Warn about unknown annotation types Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.611961175@infradead.org --- tools/objtool/check.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 26bdd3ebf5d2..bfb407f3ac96 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2335,6 +2335,10 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { switch (type) { + case ANNOTYPE_NOENDBR: + /* early */ + break; + case ANNOTYPE_RETPOLINE_SAFE: if (insn->type != INSN_JUMP_DYNAMIC && insn->type != INSN_CALL_DYNAMIC && @@ -2359,11 +2363,20 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->unret = 1; break; + case ANNOTYPE_IGNORE_ALTS: + /* early */ + break; + + case ANNOTYPE_INTRA_FUNCTION_CALL: + /* ifc */ + break; + case ANNOTYPE_REACHABLE: insn->dead_end = false; break; default: + WARN_INSN(insn, "Unknown annotation type: %d", type); break; } -- cgit v1.2.3 From c3cb6c158c64dc39838208d51dcd06d1990b371d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 11 Oct 2024 19:08:50 +0200 Subject: objtool: Allow arch code to discover jump table size In preparation for adding support for annotated jump tables, where ELF relocations and symbols are used to describe the locations of jump tables in the executable, refactor the jump table discovery logic so the table size can be returned from arch_find_switch_table(). 
Signed-off-by: Ard Biesheuvel Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241011170847.334429-12-ardb+git@google.com --- tools/objtool/arch/loongarch/special.c | 3 ++- tools/objtool/arch/powerpc/special.c | 3 ++- tools/objtool/arch/x86/special.c | 4 +++- tools/objtool/check.c | 31 ++++++++++++++++++++----------- tools/objtool/include/objtool/check.h | 5 ++++- tools/objtool/include/objtool/special.h | 3 ++- 6 files changed, 33 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index 9bba1e9318e0..87230ed570fd 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -9,7 +9,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, } struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { return NULL; } diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c index d33868147196..51610689abf7 100644 --- a/tools/objtool/arch/powerpc/special.c +++ b/tools/objtool/arch/powerpc/special.c @@ -13,7 +13,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, } struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { exit(-1); } diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 4ea0f9815fda..9c1c9df09aaa 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -109,7 +109,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, * NOTE: MITIGATION_RETPOLINE made it harder still to decode dynamic jumps. 
*/ struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { struct reloc *text_reloc, *rodata_reloc; struct section *table_sec; @@ -158,5 +159,6 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, if (reloc_type(text_reloc) == R_X86_64_PC32) file->ignore_unreachables = true; + *table_size = 0; return rodata_reloc; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index bfb407f3ac96..e92c5564d9ca 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -150,6 +150,15 @@ static inline struct reloc *insn_jump_table(struct instruction *insn) return NULL; } +static inline unsigned long insn_jump_table_size(struct instruction *insn) +{ + if (insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_CALL_DYNAMIC) + return insn->_jump_table_size; + + return 0; +} + static bool is_jump_table_jump(struct instruction *insn) { struct alt_group *alt_group = insn->alt_group; @@ -1937,6 +1946,7 @@ out: static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *next_table) { + unsigned long table_size = insn_jump_table_size(insn); struct symbol *pfunc = insn_func(insn)->pfunc; struct reloc *table = insn_jump_table(insn); struct instruction *dest_insn; @@ -1951,6 +1961,8 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, for_each_reloc_from(table->sec, reloc) { /* Check for the end of the table: */ + if (table_size && reloc_offset(reloc) - reloc_offset(table) >= table_size) + break; if (reloc != table && reloc == next_table) break; @@ -1995,12 +2007,12 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * find_jump_table() - Given a dynamic jump, find the switch jump table * associated with it. 
*/ -static struct reloc *find_jump_table(struct objtool_file *file, - struct symbol *func, - struct instruction *insn) +static void find_jump_table(struct objtool_file *file, struct symbol *func, + struct instruction *insn) { struct reloc *table_reloc; struct instruction *dest_insn, *orig_insn = insn; + unsigned long table_size; /* * Backward search using the @first_jump_src links, these help avoid @@ -2021,17 +2033,17 @@ static struct reloc *find_jump_table(struct objtool_file *file, insn->jump_dest->offset > orig_insn->offset)) break; - table_reloc = arch_find_switch_table(file, insn); + table_reloc = arch_find_switch_table(file, insn, &table_size); if (!table_reloc) continue; dest_insn = find_insn(file, table_reloc->sym->sec, reloc_addend(table_reloc)); if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; - return table_reloc; + orig_insn->_jump_table = table_reloc; + orig_insn->_jump_table_size = table_size; + break; } - - return NULL; } /* @@ -2042,7 +2054,6 @@ static void mark_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *last = NULL; - struct reloc *reloc; func_for_each_insn(file, func, insn) { if (!last) @@ -2065,9 +2076,7 @@ static void mark_func_jump_tables(struct objtool_file *file, if (insn->type != INSN_JUMP_DYNAMIC) continue; - reloc = find_jump_table(file, func, insn); - if (reloc) - insn->_jump_table = reloc; + find_jump_table(file, func, insn); } } diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index daa46f1f0965..e1cd13cd28a3 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -71,7 +71,10 @@ struct instruction { struct instruction *first_jump_src; union { struct symbol *_call_dest; - struct reloc *_jump_table; + struct { + struct reloc *_jump_table; + unsigned long _jump_table_size; + }; }; struct alternative *alts; struct symbol *sym; diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index 86d4af9c5aa9..e7ee7ffccefd 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -38,5 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, struct reloc *reloc); struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn); + struct instruction *insn, + unsigned long *table_size); #endif /* _SPECIAL_H */ -- cgit v1.2.3 From 2fe34a116c707821c99bb352cb33be277c99d491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:11 +0100 Subject: selftests/bpf: add a macro to compare raw memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We sometimes need to compare whole structures in an assert. It is possible to use the existing macros on each field, but when the whole structure has to be checked, it is more convenient to simply compare the whole structure memory Add a dedicated assert macro, ASSERT_MEMEQ, to allow bare memory comparision The output generated by this new macro looks like the following: [...] 
run_tests_skb_less:FAIL:returned flow keys unexpected memory mismatch actual: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 expected: 0E 00 3E 00 DD 86 01 01 00 06 86 DD 50 00 90 1F 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 [...] Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-1-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 15 +++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 6088d8222d59..c9e745d49493 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1282,6 +1282,21 @@ void crash_handler(int signum) backtrace_symbols_fd(bt, sz, STDERR_FILENO); } +void hexdump(const char *prefix, const void *buf, size_t len) +{ + for (int i = 0; i < len; i++) { + if (!(i % 16)) { + if (i) + fprintf(stdout, "\n"); + fprintf(stdout, "%s", prefix); + } + if (i && !(i % 8) && (i % 16)) + fprintf(stdout, "\t"); + fprintf(stdout, "%02X ", ((uint8_t *)(buf))[i]); + } + fprintf(stdout, "\n"); +} + static void sigint_handler(int signum) { int i; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 74de33ae37e5..404d0d4915d5 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -185,6 +185,7 @@ void test__end_subtest(void); void test__skip(void); void test__fail(void); int test__join_cgroup(const char *path); +void hexdump(const char *prefix, const void *buf, size_t len); #define PRINT_FAIL(format...) \ ({ \ @@ -344,6 +345,20 @@ int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_MEMEQ(actual, expected, len, name) ({ \ + static int duration = 0; \ + const void *__act = actual; \ + const void *__exp = expected; \ + int __len = len; \ + bool ___ok = memcmp(__act, __exp, __len) == 0; \ + CHECK(!___ok, (name), "unexpected memory mismatch\n"); \ + fprintf(stdout, "actual:\n"); \ + hexdump("\t", __act, __len); \ + fprintf(stdout, "expected:\n"); \ + hexdump("\t", __exp, __len); \ + ___ok; \ +}) + #define ASSERT_OK(res, name) ({ \ static int duration = 0; \ long long ___res = (res); \ -- cgit v1.2.3 From 3fed5d084fb36365ccf06b6fe20d19e0e672a47d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:12 +0100 Subject: selftests/bpf: use ASSERT_MEMEQ to compare bpf flow keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flow_dissector program currently compares flow keys returned by bpf program with the expected one thanks to a custom macro using memcmp. Use the new ASSERT_MEMEQ macro to perform this comparision. 
This update also allows to get rid of the unused bpf_test_run_opts variable in run_tests_skb_less (it was only used by the CHECK macro for its duration field) Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-2-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 36 ++++------------------ 1 file changed, 6 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index cfcc90cb7ffb..3ea25ecdf3c9 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -13,33 +13,6 @@ #define IP_MF 0x2000 #endif -#define CHECK_FLOW_KEYS(desc, got, expected) \ - _CHECK(memcmp(&got, &expected, sizeof(got)) != 0, \ - desc, \ - topts.duration, \ - "nhoff=%u/%u " \ - "thoff=%u/%u " \ - "addr_proto=0x%x/0x%x " \ - "is_frag=%u/%u " \ - "is_first_frag=%u/%u " \ - "is_encap=%u/%u " \ - "ip_proto=0x%x/0x%x " \ - "n_proto=0x%x/0x%x " \ - "flow_label=0x%x/0x%x " \ - "sport=%u/%u " \ - "dport=%u/%u\n", \ - got.nhoff, expected.nhoff, \ - got.thoff, expected.thoff, \ - got.addr_proto, expected.addr_proto, \ - got.is_frag, expected.is_frag, \ - got.is_first_frag, expected.is_first_frag, \ - got.is_encap, expected.is_encap, \ - got.ip_proto, expected.ip_proto, \ - got.n_proto, expected.n_proto, \ - got.flow_label, expected.flow_label, \ - got.sport, expected.sport, \ - got.dport, expected.dport) - struct ipv4_pkt { struct ethhdr eth; struct iphdr iph; @@ -545,7 +518,6 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) /* Keep in sync with 'flags' from eth_get_headlen. */ __u32 eth_get_headlen_flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; - LIBBPF_OPTS(bpf_test_run_opts, topts); struct bpf_flow_keys flow_keys = {}; __u32 key = (__u32)(tests[i].keys.sport) << 16 | tests[i].keys.dport; @@ -567,7 +539,9 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); ASSERT_OK(err, "bpf_map_lookup_elem"); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + ASSERT_MEMEQ(&flow_keys, &tests[i].keys, + sizeof(struct bpf_flow_keys), + "returned flow keys"); err = bpf_map_delete_elem(keys_fd, &key); ASSERT_OK(err, "bpf_map_delete_elem"); @@ -656,7 +630,9 @@ void test_flow_dissector(void) continue; ASSERT_EQ(topts.data_size_out, sizeof(flow_keys), "test_run data_size_out"); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + ASSERT_MEMEQ(&flow_keys, &tests[i].keys, + sizeof(struct bpf_flow_keys), + "returned flow keys"); } /* Do the same tests but for skb-less flow dissector. -- cgit v1.2.3 From 28494d6a277ebe5b9a5b1313b1f1ee396b890e35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:13 +0100 Subject: selftests/bpf: replace CHECK calls with ASSERT macros in flow_dissector test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flow dissector test currently relies on generic CHECK macros to perform tests. Update those to newer, more-specific ASSERT macros. 
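A before/after sketch of the conversion, following the pattern used throughout this file (only the error string changes per call site):

	/* before: CHECK() needs a 'duration' variable and a hand-written format */
	if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd))
		return;

	/* after: the dedicated ASSERT helper does its own reporting */
	if (!ASSERT_OK_FD(keys_fd, "bpf_map__fd"))
		return;
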
This update allows to get rid of the global duration variable, which was needed by the CHECK macros Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-3-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 41 +++++++++++----------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 3ea25ecdf3c9..6fbe8b6dad56 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -79,7 +79,6 @@ struct test { #define VLAN_HLEN 4 -static __u32 duration; struct test tests[] = { { .name = "ipv4", @@ -511,7 +510,7 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) int i, err, keys_fd; keys_fd = bpf_map__fd(keys); - if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + if (!ASSERT_OK_FD(keys_fd, "bpf_map__fd")) return; for (i = 0; i < ARRAY_SIZE(tests); i++) { @@ -530,14 +529,16 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) continue; err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); - CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + if (!ASSERT_EQ(err, sizeof(tests[i].pkt), "tx_tap")) + continue; /* check the stored flow_keys only if BPF_OK expected */ if (tests[i].retval != BPF_OK) continue; err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); - ASSERT_OK(err, "bpf_map_lookup_elem"); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + continue; ASSERT_MEMEQ(&flow_keys, &tests[i].keys, sizeof(struct bpf_flow_keys), @@ -553,17 +554,17 @@ static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) int err, prog_fd; prog_fd = bpf_program__fd(skel->progs._dissect); - if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + if (!ASSERT_OK_FD(prog_fd, "bpf_program__fd")) return; err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno)) + if (!ASSERT_OK(err, "bpf_prog_attach")) return; run_tests_skb_less(tap_fd, skel->maps.last_dissection); err = bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); - CHECK(err, "bpf_prog_detach2", "err %d errno %d\n", err, errno); + ASSERT_OK(err, "bpf_prog_detach2"); } static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) @@ -572,7 +573,7 @@ static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) int err, net_fd; net_fd = open("/proc/self/ns/net", O_RDONLY); - if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno)) + if (!ASSERT_OK_FD(net_fd, "open(/proc/self/ns/net")) return; link = bpf_program__attach_netns(skel->progs._dissect, net_fd); @@ -582,7 +583,7 @@ static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) run_tests_skb_less(tap_fd, skel->maps.last_dissection); err = bpf_link__destroy(link); - CHECK(err, "bpf_link__destroy", "err %d\n", err); + ASSERT_OK(err, "bpf_link__destroy"); out_close: close(net_fd); } @@ -593,18 +594,18 @@ void test_flow_dissector(void) struct bpf_flow *skel; skel = bpf_flow__open_and_load(); - if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "open/load skeleton")) return; prog_fd = bpf_program__fd(skel->progs._dissect); - if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) - goto out_destroy_skel; + if 
(!ASSERT_OK_FD(prog_fd, "bpf_program__fd")) + return; keys_fd = bpf_map__fd(skel->maps.last_dissection); - if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) - goto out_destroy_skel; + if (!ASSERT_OK_FD(keys_fd, "bpf_map__fd")) + return; err = init_prog_array(skel->obj, skel->maps.jmp_table); - if (CHECK(err, "init_prog_array", "err %d\n", err)) - goto out_destroy_skel; + if (!ASSERT_OK(err, "init_prog_array")) + return; for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; @@ -634,17 +635,17 @@ void test_flow_dissector(void) sizeof(struct bpf_flow_keys), "returned flow keys"); } - /* Do the same tests but for skb-less flow dissector. * We use a known path in the net/tun driver that calls * eth_get_headlen and we manually export bpf_flow_keys * via BPF map in this case. */ - tap_fd = create_tap("tap0"); - CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno); + if (!ASSERT_OK_FD(tap_fd, "create_tap")) + goto out_destroy_skel; err = ifup("tap0"); - CHECK(err, "ifup", "err %d errno %d\n", err, errno); + if (!ASSERT_OK(err, "ifup")) + goto out_destroy_skel; /* Test direct prog attachment */ test_skb_less_prog_attach(skel, tap_fd); -- cgit v1.2.3 From 2b044dd186f0f378894f1e590d62325cbbf9b085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:14 +0100 Subject: selftests/bpf: re-split main function into dedicated tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flow_dissector runs plenty of tests over diffent kind of packets, grouped into three categories: skb mode, non-skb mode with direct attach, and non-skb with indirect attach. Re-split the main function into dedicated tests. Each test now must have its own setup/teardown, but for the advantage of being able to run them separately. While at it, make sure that tests attaching the bpf programs are run in a dedicated ns. 
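The resulting tests all follow the same rough shape (sketch only; "some_mode" is a placeholder name and error paths are trimmed to the ones shown in the diff below):

	void test_flow_dissector_some_mode(void)
	{
		struct bpf_flow *skel;
		struct netns_obj *ns;

		ns = netns_new("flow_dissector_some_mode_ns", true);	/* per-test netns */
		if (!ASSERT_OK_PTR(ns, "create and open netns"))
			return;

		skel = bpf_flow__open_and_load();
		if (!ASSERT_OK_PTR(skel, "open/load skeleton"))
			goto out_clean_ns;

		/* attach the dissector, send the packet matrix, detach */

		bpf_flow__destroy(skel);
	out_clean_ns:
		netns_free(ns);
	}
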
Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-4-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 108 ++++++++++++++------- 1 file changed, 73 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 6fbe8b6dad56..7e7051a85be7 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -549,63 +549,117 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) } } -static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) +void test_flow_dissector_skb_less_direct_attach(void) { - int err, prog_fd; + int err, prog_fd, tap_fd; + struct bpf_flow *skel; + struct netns_obj *ns; + + ns = netns_new("flow_dissector_skb_less_indirect_attach_ns", true); + if (!ASSERT_OK_PTR(ns, "create and open netns")) + return; + + skel = bpf_flow__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open/load skeleton")) + goto out_clean_ns; + + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (!ASSERT_OK(err, "init_prog_array")) + goto out_destroy_skel; prog_fd = bpf_program__fd(skel->progs._dissect); if (!ASSERT_OK_FD(prog_fd, "bpf_program__fd")) - return; + goto out_destroy_skel; err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); if (!ASSERT_OK(err, "bpf_prog_attach")) - return; + goto out_destroy_skel; + + tap_fd = create_tap("tap0"); + if (!ASSERT_OK_FD(tap_fd, "create_tap")) + goto out_destroy_skel; + err = ifup("tap0"); + if (!ASSERT_OK(err, "ifup")) + goto out_close_tap; run_tests_skb_less(tap_fd, skel->maps.last_dissection); err = bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); ASSERT_OK(err, "bpf_prog_detach2"); + +out_close_tap: + close(tap_fd); +out_destroy_skel: + bpf_flow__destroy(skel); +out_clean_ns: + netns_free(ns); } -static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) +void test_flow_dissector_skb_less_indirect_attach(void) { + int err, net_fd, tap_fd; + struct bpf_flow *skel; struct bpf_link *link; - int err, net_fd; + struct netns_obj *ns; + + ns = netns_new("flow_dissector_skb_less_indirect_attach_ns", true); + if (!ASSERT_OK_PTR(ns, "create and open netns")) + return; + + skel = bpf_flow__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open/load skeleton")) + goto out_clean_ns; net_fd = open("/proc/self/ns/net", O_RDONLY); if (!ASSERT_OK_FD(net_fd, "open(/proc/self/ns/net")) - return; + goto out_destroy_skel; + + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (!ASSERT_OK(err, "init_prog_array")) + goto out_destroy_skel; + + tap_fd = create_tap("tap0"); + if (!ASSERT_OK_FD(tap_fd, "create_tap")) + goto out_close_ns; + err = ifup("tap0"); + if (!ASSERT_OK(err, "ifup")) + goto out_close_tap; link = bpf_program__attach_netns(skel->progs._dissect, net_fd); if (!ASSERT_OK_PTR(link, "attach_netns")) - goto out_close; + goto out_close_tap; run_tests_skb_less(tap_fd, skel->maps.last_dissection); err = bpf_link__destroy(link); ASSERT_OK(err, "bpf_link__destroy"); -out_close: + +out_close_tap: + close(tap_fd); +out_close_ns: close(net_fd); +out_destroy_skel: + bpf_flow__destroy(skel); +out_clean_ns: + netns_free(ns); } -void test_flow_dissector(void) +void test_flow_dissector_skb(void) { - int i, err, prog_fd, keys_fd = -1, tap_fd; struct bpf_flow *skel; + int i, err, prog_fd; skel = 
bpf_flow__open_and_load(); if (!ASSERT_OK_PTR(skel, "open/load skeleton")) return; - prog_fd = bpf_program__fd(skel->progs._dissect); - if (!ASSERT_OK_FD(prog_fd, "bpf_program__fd")) - return; - keys_fd = bpf_map__fd(skel->maps.last_dissection); - if (!ASSERT_OK_FD(keys_fd, "bpf_map__fd")) - return; err = init_prog_array(skel->obj, skel->maps.jmp_table); if (!ASSERT_OK(err, "init_prog_array")) - return; + goto out_destroy_skel; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (!ASSERT_OK_FD(prog_fd, "bpf_program__fd")) + goto out_destroy_skel; for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; @@ -635,24 +689,8 @@ void test_flow_dissector(void) sizeof(struct bpf_flow_keys), "returned flow keys"); } - /* Do the same tests but for skb-less flow dissector. - * We use a known path in the net/tun driver that calls - * eth_get_headlen and we manually export bpf_flow_keys - * via BPF map in this case. - */ - tap_fd = create_tap("tap0"); - if (!ASSERT_OK_FD(tap_fd, "create_tap")) - goto out_destroy_skel; - err = ifup("tap0"); - if (!ASSERT_OK(err, "ifup")) - goto out_destroy_skel; - - /* Test direct prog attachment */ - test_skb_less_prog_attach(skel, tap_fd); - /* Test indirect prog attachment via link */ - test_skb_less_link_create(skel, tap_fd); - close(tap_fd); out_destroy_skel: bpf_flow__destroy(skel); } + -- cgit v1.2.3 From a2cc66bb937a68d22b5acf7d08f7c8cb5fa859be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:15 +0100 Subject: selftests/bpf: expose all subtests from flow_dissector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flow_dissector test integrated in test_progs actually runs a wide matrix of tests over different packets types and bpf programs modes, but exposes only 3 main tests, preventing tests users from running specific subtests with a specific input only. Expose all subtests executed by flow_dissector by using test__start_subtest(). 
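Each packet case is then gated individually, along these lines (test_suffix is whatever string the caller passes to distinguish the skb and non-skb runs):

	char test_name[TEST_NAME_MAX_LEN];
	int i;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		snprintf(test_name, TEST_NAME_MAX_LEN, "%s-%s",
			 tests[i].name, test_suffix);
		if (!test__start_subtest(test_name))
			continue;	/* not selected on the command line, skip */

		/* ... run this single packet case ... */
	}
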
Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-5-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 7e7051a85be7..29182009cda9 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -8,6 +8,7 @@ #include "bpf_flow.skel.h" #define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */ +#define TEST_NAME_MAX_LEN 64 #ifndef IP_MF #define IP_MF 0x2000 @@ -505,8 +506,10 @@ static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) return 0; } -static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) +static void run_tests_skb_less(int tap_fd, struct bpf_map *keys, + char *test_suffix) { + char test_name[TEST_NAME_MAX_LEN]; int i, err, keys_fd; keys_fd = bpf_map__fd(keys); @@ -520,6 +523,10 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) struct bpf_flow_keys flow_keys = {}; __u32 key = (__u32)(tests[i].keys.sport) << 16 | tests[i].keys.dport; + snprintf(test_name, TEST_NAME_MAX_LEN, "%s-%s", tests[i].name, + test_suffix); + if (!test__start_subtest(test_name)) + continue; /* For skb-less case we can't pass input flags; run * only the tests that have a matching set of flags. @@ -582,7 +589,8 @@ void test_flow_dissector_skb_less_direct_attach(void) if (!ASSERT_OK(err, "ifup")) goto out_close_tap; - run_tests_skb_less(tap_fd, skel->maps.last_dissection); + run_tests_skb_less(tap_fd, skel->maps.last_dissection, + "non-skb-direct-attach"); err = bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); ASSERT_OK(err, "bpf_prog_detach2"); @@ -629,7 +637,8 @@ void test_flow_dissector_skb_less_indirect_attach(void) if (!ASSERT_OK_PTR(link, "attach_netns")) goto out_close_tap; - run_tests_skb_less(tap_fd, skel->maps.last_dissection); + run_tests_skb_less(tap_fd, skel->maps.last_dissection, + "non-skb-indirect-attach"); err = bpf_link__destroy(link); ASSERT_OK(err, "bpf_link__destroy"); @@ -646,6 +655,7 @@ out_clean_ns: void test_flow_dissector_skb(void) { + char test_name[TEST_NAME_MAX_LEN]; struct bpf_flow *skel; int i, err, prog_fd; @@ -670,6 +680,10 @@ void test_flow_dissector_skb(void) ); static struct bpf_flow_keys ctx = {}; + snprintf(test_name, TEST_NAME_MAX_LEN, "%s-skb", tests[i].name); + if (!test__start_subtest(test_name)) + continue; + if (tests[i].flags) { topts.ctx_in = &ctx; topts.ctx_size_in = sizeof(ctx); -- cgit v1.2.3 From b4940402675004a7e2a14c83824d5c15e3e80f9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:16 +0100 Subject: selftests/bpf: add gre packets testing to flow_dissector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_flow program is able to handle GRE headers in IP packets. 
Add a few test data input simulating those GRE packets, with 2 different cases: - parse GRE and the encapsulated packet - parse GRE only Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-6-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 76 ++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 29182009cda9..1e17254376ec 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -63,6 +63,19 @@ struct dvlan_ipv6_pkt { struct tcphdr tcp; } __packed; +struct gre_base_hdr { + __be16 flags; + __be16 protocol; +} gre_base_hdr; + +struct gre_minimal_pkt { + struct ethhdr eth; + struct iphdr iph; + struct gre_base_hdr gre_hdr; + struct iphdr iph_inner; + struct tcphdr tcp; +} __packed; + struct test { const char *name; union { @@ -72,6 +85,7 @@ struct test { struct ipv6_pkt ipv6; struct ipv6_frag_pkt ipv6_frag; struct dvlan_ipv6_pkt dvlan_ipv6; + struct gre_minimal_pkt gre_minimal; } pkt; struct bpf_flow_keys keys; __u32 flags; @@ -417,6 +431,68 @@ struct test tests[] = { }, .retval = BPF_FLOW_DISSECTOR_CONTINUE, }, + { + .name = "ip-gre", + .pkt.gre_minimal = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_GRE, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .gre_hdr = { + .flags = 0, + .protocol = __bpf_constant_htons(ETH_P_IP), + }, + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr) * 2 + + sizeof(struct gre_base_hdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_encap = true, + .sport = 80, + .dport = 8080, + }, + .retval = BPF_OK, + }, + { + .name = "ip-gre-no-encap", + .pkt.ipip = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_GRE, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr) + + sizeof(struct gre_base_hdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_GRE, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_encap = true, + }, + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + .retval = BPF_OK, + }, }; static int create_tap(const char *ifname) -- cgit v1.2.3 From 6fb5be12d1bb66e8dce6238e4387f0db99efee25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:17 +0100 Subject: selftests/bpf: migrate flow_dissector namespace exclusivity test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a11c397c43d5 ("bpf/flow_dissector: add mode to enforce global BPF flow dissector") is currently tested in test_flow_dissector.sh, which is not part of test_progs. 
Add the corresponding test to flow_dissector.c, which is part of test_progs. The new test reproduces the behavior implemented in its shell script counterpart: - attach a flow dissector program to the root net namespace, ensure that we can not attach another flow dissector in any non-root net namespace - attach a flow dissector program to a non-root net namespace, ensure that we can not attach another flow dissector in root namespace Since the new test is performing operations in the root net namespace, make sure to set it as a "serial" test to make sure not to conflict with any other test. Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-7-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/flow_dissector.c | 62 ++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 1e17254376ec..8e6e483fead3 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -7,6 +7,7 @@ #include "bpf_flow.skel.h" +#define TEST_NS "flow_dissector_ns" #define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */ #define TEST_NAME_MAX_LEN 64 @@ -495,6 +496,67 @@ struct test tests[] = { }, }; +void serial_test_flow_dissector_namespace(void) +{ + struct bpf_flow *skel; + struct nstoken *ns; + int err, prog_fd; + + skel = bpf_flow__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open/load skeleton")) + return; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (!ASSERT_OK_FD(prog_fd, "get dissector fd")) + goto out_destroy_skel; + + /* We must be able to attach a flow dissector to root namespace */ + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (!ASSERT_OK(err, "attach on root namespace ok")) + goto out_destroy_skel; + + err = make_netns(TEST_NS); + if (!ASSERT_OK(err, "create non-root net namespace")) + goto out_destroy_skel; + + /* We must not be able to additionally attach a flow dissector to a + * non-root net namespace + */ + ns = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(ns, "enter non-root net namespace")) + goto out_clean_ns; + + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + close_netns(ns); + ASSERT_ERR(err, "refuse new flow dissector in non-root net namespace"); + ASSERT_EQ(errno, EEXIST, "refused because of already attached prog"); + + /* If no flow dissector is attached to the root namespace, we must + * be able to attach one to a non-root net namespace + */ + bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); + ns = open_netns(TEST_NS); + ASSERT_OK_PTR(ns, "enter non-root net namespace"); + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + close_netns(ns); + ASSERT_OK(err, "accept new flow dissector in non-root net namespace"); + + /* If a flow dissector is attached to non-root net namespace, attaching + * a flow dissector to root namespace must fail + */ + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + ASSERT_ERR(err, "refuse new flow dissector on root namespace"); + ASSERT_EQ(errno, EEXIST, "refused because of already attached prog"); + + ns = open_netns(TEST_NS); + bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); + close_netns(ns); +out_clean_ns: + remove_netns(TEST_NS); +out_destroy_skel: + bpf_flow__destroy(skel); +} + static int create_tap(const char *ifname) { struct ifreq ifr = { -- cgit v1.2.3 
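The "serial" naming used above matters: test_progs keeps serial_test_* functions out of the parallel run, so such a test may safely touch global state. A minimal skeleton (the function name here is only a placeholder):

	/* never run concurrently with other tests by test_progs */
	void serial_test_touches_global_state(void)
	{
		/* ... attach/detach flow dissectors in the root netns ... */
	}
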
From c24010821a89954a93b39354c42596d315518c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:18 +0100 Subject: selftests/bpf: Enable generic tc actions in selftests config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable CONFIG_NET_ACT_GACT to allow adding simple actions with tc filters. This is for example needed to migrate test_flow_dissector into the automated testing performed in CI. Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-8-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 4ca84c8d9116..c378d5d07e02 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -58,6 +58,7 @@ CONFIG_MPLS=y CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y +CONFIG_NET_ACT_GACT=y CONFIG_NET_ACT_SKBMOD=y CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y -- cgit v1.2.3 From f4504af68575c1675235bacd8d36157cfc27ae5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:19 +0100 Subject: selftests/bpf: move ip checksum helper to network helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_metadata test has a small helper computing ipv4 checksums to allow manually building packets. Move this helper to network_helpers to share it with other tests. Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-9-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.h | 24 ++++++++++++++++++++++ .../selftests/bpf/prog_tests/xdp_metadata.c | 19 +---------------- 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5764155b6d25..0b2ed90c763f 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -105,6 +105,30 @@ static __u16 csum_fold(__u32 csum) return (__u16)~csum; } +static __wsum csum_partial(const void *buf, int len, __wsum sum) +{ + __u16 *p = (__u16 *)buf; + int num_u16 = len >> 1; + int i; + + for (i = 0; i < num_u16; i++) + sum += p[i]; + + return sum; +} + +static inline __sum16 build_ip_csum(struct iphdr *iph) +{ + __u32 sum = 0; + __u16 *p; + + iph->check = 0; + p = (void *)iph; + sum = csum_partial(p, iph->ihl << 2, 0); + + return csum_fold(sum); +} + static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum csum) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index c87ee2bf558c..7f8e16165533 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -133,23 +133,6 @@ static void close_xsk(struct xsk *xsk) munmap(xsk->umem_area, UMEM_SIZE); } -static void ip_csum(struct iphdr *iph) -{ - __u32 sum = 0; - __u16 *p; - int i; - - iph->check = 0; - p = (void *)iph; - for (i = 0; i < sizeof(*iph) / sizeof(*p); i++) - sum += p[i]; - - while (sum >> 16) - sum = (sum & 
0xffff) + (sum >> 16); - - iph->check = ~sum; -} - static int generate_packet(struct xsk *xsk, __u16 dst_port) { struct xsk_tx_metadata *meta; @@ -192,7 +175,7 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) iph->protocol = IPPROTO_UDP; ASSERT_EQ(inet_pton(FAMILY, TX_ADDR, &iph->saddr), 1, "inet_pton(TX_ADDR)"); ASSERT_EQ(inet_pton(FAMILY, RX_ADDR, &iph->daddr), 1, "inet_pton(RX_ADDR)"); - ip_csum(iph); + iph->check = build_ip_csum(iph); udph->source = htons(UDP_SOURCE_PORT); udph->dest = htons(dst_port); -- cgit v1.2.3 From 752fddc0501c540214875b26bde2538f0a831811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:20 +0100 Subject: selftests/bpf: document pseudo-header checksum helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit network_helpers.h provides helpers to compute checksum for pseudo headers but no helpers to compute the global checksums. Before adding those, clarify csum_tcpudp_magic and csum_ipv6_magic purpose by adding some documentation. Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-10-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.h | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 0b2ed90c763f..00d6e7d52545 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -129,6 +129,21 @@ static inline __sum16 build_ip_csum(struct iphdr *iph) return csum_fold(sum); } +/** + * csum_tcpudp_magic - compute IP pseudo-header checksum + * + * Compute the IPv4 pseudo header checksum. The helper can take a + * accumulated sum from the transport layer to accumulate it and directly + * return the transport layer + * + * @saddr: IP source address + * @daddr: IP dest address + * @len: IP data size + * @proto: transport layer protocol + * @csum: The accumulated partial sum to add to the computation + * + * Returns the folded sum + */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum csum) @@ -144,6 +159,21 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, return csum_fold((__u32)s); } +/** + * csum_ipv6_magic - compute IPv6 pseudo-header checksum + * + * Compute the ipv6 pseudo header checksum. 
The helper can take a + * accumulated sum from the transport layer to accumulate it and directly + * return the transport layer + * + * @saddr: IPv6 source address + * @daddr: IPv6 dest address + * @len: IPv6 data size + * @proto: transport layer protocol + * @csum: The accumulated partial sum to add to the computation + * + * Returns the folded sum + */ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, -- cgit v1.2.3 From a2f482c34a52176ae89d143979bbc9e7a72857c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:21 +0100 Subject: selftests/bpf: use the same udp and tcp headers in tests under test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to add udp-dedicated helpers in network_helpers involves including some udp header, which makes multiple test_progs tests build fail: In file included from ./progs/test_cls_redirect.h:13, from [...]/prog_tests/cls_redirect.c:15: [...]/usr/include/linux/udp.h:23:8: error: redefinition of ‘struct udphdr’ 23 | struct udphdr { | ^~~~~~ In file included from ./network_helpers.h:17, from [...]/prog_tests/cls_redirect.c:13: [...]/usr/include/netinet/udp.h:55:8: note: originally defined here 55 | struct udphdr | ^~~~~~ This error is due to struct udphdr being defined in both and . Use only in every test. While at it, perform the same for tcp.h. For some tests, the change needs to be done in the eBPF program part as well, because of some headers sharing between both sides. Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-11-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_bonding.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 2 +- tools/testing/selftests/bpf/progs/test_cls_redirect.c | 2 +- tools/testing/selftests/bpf/progs/test_cls_redirect.h | 2 +- tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c | 2 +- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 27784946b01b..80844a5fb1fe 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 05d0e07da394..ba6b3ec1156a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -2,7 +2,7 @@ #include #include "cgroup_helpers.h" -#include +#include #include #include "sockopt_sk.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c index 6d8b54124cb3..fb952703653e 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -17,7 +17,7 @@ #include "network_helpers.h" #include #include 
-#include +#include #include #include "xdp_dummy.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index bad0ea167be7..d12f926b4b8b 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include "test_xdp_do_redirect.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c index e1bf141d3401..3f9146d83d79 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 7f8e16165533..3d47878ef6bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index 683c8aaa63da..f344c6835e84 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h index 233b089d1fba..eb55cb8a3dbd 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.h +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include /* offsetof() is used in static asserts, and the libbpf-redefined CO-RE * friendly version breaks compilation for older clang versions <= 15 diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c index 464515b824b9..d0f7670351e5 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 6f9956eed797..06266aad2f99 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From bcc00987bc56faa3d0e20762ececefc831506846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:22 +0100 Subject: selftests/bpf: add network helpers to generate udp checksums MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit network_helpers.c provides some helpers to generate ip checksums or ip pseudo-header checksums, but not for upper layers (eg: udp checksums) Add helpers for udp checksum to allow manually building udp packets. 
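A minimal caller-side sketch for the IPv4 helper (illustrative only: it assumes a buffer holding an option-less IPv4 header immediately followed by a UDP header whose ports, plus payload_len bytes of payload, are already filled in):

	static void finalize_udp_v4(void *buf, int payload_len)
	{
		struct iphdr *iph = buf;			/* ihl == 5 assumed */
		struct udphdr *udph = (void *)(iph + 1);

		udph->len = htons(sizeof(*udph) + payload_len);
		udph->check = 0;				/* must be zero while summing */
		udph->check = build_udp_v4_csum(iph, udph);
	}
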
Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-12-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.h | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 00d6e7d52545..ebec8a8d6f81 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -14,6 +14,7 @@ typedef __u16 __sum16; #include #include #include +#include #include #include @@ -193,6 +194,47 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, return csum_fold((__u32)s); } +/** + * build_udp_v4_csum - compute UDP checksum for UDP over IPv4 + * + * Compute the checksum to embed in UDP header, composed of the sum of IP + * pseudo-header checksum, UDP header checksum and UDP data checksum + * @iph IP header + * @udph UDP header, which must be immediately followed by UDP data + * + * Returns the total checksum + */ + +static inline __sum16 build_udp_v4_csum(const struct iphdr *iph, + const struct udphdr *udph) +{ + unsigned long sum; + + sum = csum_partial(udph, ntohs(udph->len), 0); + return csum_tcpudp_magic(iph->saddr, iph->daddr, ntohs(udph->len), + IPPROTO_UDP, sum); +} + +/** + * build_udp_v6_csum - compute UDP checksum for UDP over IPv6 + * + * Compute the checksum to embed in UDP header, composed of the sum of IPv6 + * pseudo-header checksum, UDP header checksum and UDP data checksum + * @ip6h IPv6 header + * @udph UDP header, which must be immediately followed by UDP data + * + * Returns the total checksum + */ +static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, + const struct udphdr *udph) +{ + unsigned long sum; + + sum = csum_partial(udph, ntohs(udph->len), 0); + return csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ntohs(udph->len), + IPPROTO_UDP, sum); +} + struct tmonitor_ctx; #ifdef TRAFFIC_MONITOR -- cgit v1.2.3 From 20203a51e3940abb1639d41bbcfe995f6463bbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:23 +0100 Subject: selftests/bpf: migrate bpf flow dissectors tests to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_flow_dissector.sh loads flow_dissector program and subprograms, creates and configured relevant tunnels and interfaces, and ensure that the bpf dissection is actually performed correctly. Similar tests exist in test_progs (thanks to flow_dissector.c) and run the same programs, but those are only executed with BPF_PROG_RUN: those tests are then missing some coverage (eg: coverage for flow keys manipulated when the configured flower uses a port range, which has a dedicated test in test_flow_dissector.sh) Convert test_flow_dissector.sh into test_progs so that the corresponding tests are also run in CI. 
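The converted test keeps the shell script's structure by describing each scenario as data, with optional setup/teardown callbacks, and driving them as subtests, roughly as follows (sketch of the loop added below; member names are the ones used in the new file):

	for (i = 0; i < ARRAY_SIZE(tests_input); i++) {
		if (!test__start_subtest(tests_input[i].name))
			continue;
		if (tests_input[i].test_setup &&
		    !ASSERT_OK(tests_input[i].test_setup(), "init filter"))
			continue;

		/* send packets from each configured source port, count receptions */

		if (tests_input[i].test_teardown)
			tests_input[i].test_teardown();
	}
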
Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-13-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/flow_dissector_classification.c | 792 +++++++++++++++++++++ 1 file changed, 792 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c new file mode 100644 index 000000000000..3729fbfd3084 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_progs.h" +#include "network_helpers.h" +#include "bpf_util.h" +#include "bpf_flow.skel.h" + +#define CFG_PORT_INNER 8000 +#define CFG_PORT_GUE 6080 +#define SUBTEST_NAME_MAX_LEN 32 +#define TEST_NAME_MAX_LEN (32 + SUBTEST_NAME_MAX_LEN) +#define MAX_SOURCE_PORTS 3 +#define TEST_PACKETS_COUNT 10 +#define TEST_PACKET_LEN 100 +#define TEST_PACKET_PATTERN 'a' +#define TEST_IPV4 "192.168.0.1/32" +#define TEST_IPV6 "100::a/128" +#define TEST_TUNNEL_REMOTE "127.0.0.2" +#define TEST_TUNNEL_LOCAL "127.0.0.1" + +#define INIT_ADDR4(addr4, port) \ + { \ + .sin_family = AF_INET, \ + .sin_port = __constant_htons(port), \ + .sin_addr.s_addr = __constant_htonl(addr4), \ + } + +#define INIT_ADDR6(addr6, port) \ + { \ + .sin6_family = AF_INET6, \ + .sin6_port = __constant_htons(port), \ + .sin6_addr = addr6, \ + } +#define TEST_IN4_SRC_ADDR_DEFAULT INIT_ADDR4(INADDR_LOOPBACK + 2, 0) +#define TEST_IN4_DST_ADDR_DEFAULT INIT_ADDR4(INADDR_LOOPBACK, CFG_PORT_INNER) +#define TEST_OUT4_SRC_ADDR_DEFAULT INIT_ADDR4(INADDR_LOOPBACK + 1, 0) +#define TEST_OUT4_DST_ADDR_DEFAULT INIT_ADDR4(INADDR_LOOPBACK, 0) + +#define TEST_IN6_SRC_ADDR_DEFAULT INIT_ADDR6(IN6ADDR_LOOPBACK_INIT, 0) +#define TEST_IN6_DST_ADDR_DEFAULT \ + INIT_ADDR6(IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER) +#define TEST_OUT6_SRC_ADDR_DEFAULT INIT_ADDR6(IN6ADDR_LOOPBACK_INIT, 0) +#define TEST_OUT6_DST_ADDR_DEFAULT INIT_ADDR6(IN6ADDR_LOOPBACK_INIT, 0) + +#define TEST_IN4_SRC_ADDR_DISSECT_CONTINUE INIT_ADDR4(INADDR_LOOPBACK + 126, 0) +#define TEST_IN4_SRC_ADDR_IPIP INIT_ADDR4((in_addr_t)0x01010101, 0) +#define TEST_IN4_DST_ADDR_IPIP INIT_ADDR4((in_addr_t)0xC0A80001, CFG_PORT_INNER) + +struct grehdr { + uint16_t unused; + uint16_t protocol; +} __packed; + +struct guehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hlen : 5, control : 1, version : 2; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version : 2, control : 1, hlen : 5; +#else +#error "Please fix " +#endif + __u8 proto_ctype; + __be16 flags; + }; + __be32 word; + }; +}; + +static char buf[ETH_DATA_LEN]; + +struct test_configuration { + char name[SUBTEST_NAME_MAX_LEN]; + int (*test_setup)(void); + void (*test_teardown)(void); + int source_ports[MAX_SOURCE_PORTS]; + int cfg_l3_inner; + struct sockaddr_in in_saddr4; + struct sockaddr_in in_daddr4; + struct sockaddr_in6 in_saddr6; + struct sockaddr_in6 in_daddr6; + int cfg_l3_outer; + struct sockaddr_in out_saddr4; + struct sockaddr_in out_daddr4; + struct sockaddr_in6 out_saddr6; + struct sockaddr_in6 out_daddr6; + int cfg_encap_proto; + uint8_t cfg_dsfield_inner; + uint8_t 
cfg_dsfield_outer; + int cfg_l3_extra; + struct sockaddr_in extra_saddr4; + struct sockaddr_in extra_daddr4; + struct sockaddr_in6 extra_saddr6; + struct sockaddr_in6 extra_daddr6; +}; + +static unsigned long util_gettime(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static void build_ipv4_header(void *header, uint8_t proto, uint32_t src, + uint32_t dst, int payload_len, uint8_t tos) +{ + struct iphdr *iph = header; + + iph->ihl = 5; + iph->version = 4; + iph->tos = tos; + iph->ttl = 8; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->id = htons(1337); + iph->protocol = proto; + iph->saddr = src; + iph->daddr = dst; + iph->check = build_ip_csum((void *)iph); +} + +static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) +{ + uint16_t val, *ptr = (uint16_t *)ip6h; + + val = ntohs(*ptr); + val &= 0xF00F; + val |= ((uint16_t)dsfield) << 4; + *ptr = htons(val); +} + +static void build_ipv6_header(void *header, uint8_t proto, + const struct sockaddr_in6 *src, + const struct sockaddr_in6 *dst, int payload_len, + uint8_t dsfield) +{ + struct ipv6hdr *ip6h = header; + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = proto; + ip6h->hop_limit = 8; + ipv6_set_dsfield(ip6h, dsfield); + + memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr)); + memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr)); +} + +static void build_udp_header(void *header, int payload_len, uint16_t sport, + uint16_t dport, int family) +{ + struct udphdr *udph = header; + int len = sizeof(*udph) + payload_len; + + udph->source = htons(sport); + udph->dest = htons(dport); + udph->len = htons(len); + udph->check = 0; + if (family == AF_INET) + udph->check = build_udp_v4_csum(header - sizeof(struct iphdr), + udph); + else + udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr), + udph); +} + +static void build_gue_header(void *header, uint8_t proto) +{ + struct guehdr *gueh = header; + + gueh->proto_ctype = proto; +} + +static void build_gre_header(void *header, uint16_t proto) +{ + struct grehdr *greh = header; + + greh->protocol = htons(proto); +} + +static int l3_length(int family) +{ + if (family == AF_INET) + return sizeof(struct iphdr); + else + return sizeof(struct ipv6hdr); +} + +static int build_packet(const struct test_configuration *test, uint16_t sport) +{ + int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0; + int el3_len = 0, packet_len; + + memset(buf, 0, ETH_DATA_LEN); + + if (test->cfg_l3_extra) + el3_len = l3_length(test->cfg_l3_extra); + + /* calculate header offsets */ + if (test->cfg_encap_proto) { + ol3_len = l3_length(test->cfg_l3_outer); + + if (test->cfg_encap_proto == IPPROTO_GRE) + ol4_len = sizeof(struct grehdr); + else if (test->cfg_encap_proto == IPPROTO_UDP) + ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr); + } + + il3_len = l3_length(test->cfg_l3_inner); + il4_len = sizeof(struct udphdr); + + packet_len = el3_len + ol3_len + ol4_len + il3_len + il4_len + + TEST_PACKET_LEN; + if (!ASSERT_LE(packet_len, sizeof(buf), "check packet size")) + return -1; + + /* + * Fill packet from inside out, to calculate correct checksums. + * But create ip before udp headers, as udp uses ip for pseudo-sum. 
+ */ + memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len, + TEST_PACKET_PATTERN, TEST_PACKET_LEN); + + /* add zero byte for udp csum padding */ + buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + TEST_PACKET_LEN] = + 0; + + switch (test->cfg_l3_inner) { + case PF_INET: + build_ipv4_header(buf + el3_len + ol3_len + ol4_len, + IPPROTO_UDP, test->in_saddr4.sin_addr.s_addr, + test->in_daddr4.sin_addr.s_addr, + il4_len + TEST_PACKET_LEN, + test->cfg_dsfield_inner); + break; + case PF_INET6: + build_ipv6_header(buf + el3_len + ol3_len + ol4_len, + IPPROTO_UDP, &test->in_saddr6, + &test->in_daddr6, il4_len + TEST_PACKET_LEN, + test->cfg_dsfield_inner); + break; + } + + build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len, + TEST_PACKET_LEN, sport, CFG_PORT_INNER, + test->cfg_l3_inner); + + if (!test->cfg_encap_proto) + return il3_len + il4_len + TEST_PACKET_LEN; + + switch (test->cfg_l3_outer) { + case PF_INET: + build_ipv4_header(buf + el3_len, test->cfg_encap_proto, + test->out_saddr4.sin_addr.s_addr, + test->out_daddr4.sin_addr.s_addr, + ol4_len + il3_len + il4_len + TEST_PACKET_LEN, + test->cfg_dsfield_outer); + break; + case PF_INET6: + build_ipv6_header(buf + el3_len, test->cfg_encap_proto, + &test->out_saddr6, &test->out_daddr6, + ol4_len + il3_len + il4_len + TEST_PACKET_LEN, + test->cfg_dsfield_outer); + break; + } + + switch (test->cfg_encap_proto) { + case IPPROTO_UDP: + build_gue_header(buf + el3_len + ol3_len + ol4_len - + sizeof(struct guehdr), + test->cfg_l3_inner == PF_INET ? IPPROTO_IPIP : + IPPROTO_IPV6); + build_udp_header(buf + el3_len + ol3_len, + sizeof(struct guehdr) + il3_len + il4_len + + TEST_PACKET_LEN, + sport, CFG_PORT_GUE, test->cfg_l3_outer); + break; + case IPPROTO_GRE: + build_gre_header(buf + el3_len + ol3_len, + test->cfg_l3_inner == PF_INET ? ETH_P_IP : + ETH_P_IPV6); + break; + } + + switch (test->cfg_l3_extra) { + case PF_INET: + build_ipv4_header(buf, + test->cfg_l3_outer == PF_INET ? IPPROTO_IPIP : + IPPROTO_IPV6, + test->extra_saddr4.sin_addr.s_addr, + test->extra_daddr4.sin_addr.s_addr, + ol3_len + ol4_len + il3_len + il4_len + + TEST_PACKET_LEN, + 0); + break; + case PF_INET6: + build_ipv6_header(buf, + test->cfg_l3_outer == PF_INET ? 
IPPROTO_IPIP : + IPPROTO_IPV6, + &test->extra_saddr6, &test->extra_daddr6, + ol3_len + ol4_len + il3_len + il4_len + + TEST_PACKET_LEN, + 0); + break; + } + + return el3_len + ol3_len + ol4_len + il3_len + il4_len + + TEST_PACKET_LEN; +} + +/* sender transmits encapsulated over RAW or unencap'd over UDP */ +static int setup_tx(const struct test_configuration *test) +{ + int family, fd, ret; + + if (test->cfg_l3_extra) + family = test->cfg_l3_extra; + else if (test->cfg_l3_outer) + family = test->cfg_l3_outer; + else + family = test->cfg_l3_inner; + + fd = socket(family, SOCK_RAW, IPPROTO_RAW); + if (!ASSERT_OK_FD(fd, "setup tx socket")) + return fd; + + if (test->cfg_l3_extra) { + if (test->cfg_l3_extra == PF_INET) + ret = connect(fd, (void *)&test->extra_daddr4, + sizeof(test->extra_daddr4)); + else + ret = connect(fd, (void *)&test->extra_daddr6, + sizeof(test->extra_daddr6)); + if (!ASSERT_OK(ret, "connect")) { + close(fd); + return ret; + } + } else if (test->cfg_l3_outer) { + /* connect to destination if not encapsulated */ + if (test->cfg_l3_outer == PF_INET) + ret = connect(fd, (void *)&test->out_daddr4, + sizeof(test->out_daddr4)); + else + ret = connect(fd, (void *)&test->out_daddr6, + sizeof(test->out_daddr6)); + if (!ASSERT_OK(ret, "connect")) { + close(fd); + return ret; + } + } else { + /* otherwise using loopback */ + if (test->cfg_l3_inner == PF_INET) + ret = connect(fd, (void *)&test->in_daddr4, + sizeof(test->in_daddr4)); + else + ret = connect(fd, (void *)&test->in_daddr6, + sizeof(test->in_daddr6)); + if (!ASSERT_OK(ret, "connect")) { + close(fd); + return ret; + } + } + + return fd; +} + +/* receiver reads unencapsulated UDP */ +static int setup_rx(const struct test_configuration *test) +{ + int fd, ret; + + fd = socket(test->cfg_l3_inner, SOCK_DGRAM, 0); + if (!ASSERT_OK_FD(fd, "socket rx")) + return fd; + + if (test->cfg_l3_inner == PF_INET) + ret = bind(fd, (void *)&test->in_daddr4, + sizeof(test->in_daddr4)); + else + ret = bind(fd, (void *)&test->in_daddr6, + sizeof(test->in_daddr6)); + if (!ASSERT_OK(ret, "bind rx")) { + close(fd); + return ret; + } + + return fd; +} + +static int do_tx(int fd, const char *pkt, int len) +{ + int ret; + + ret = write(fd, pkt, len); + return ret != len; +} + +static int do_poll(int fd, short events, int timeout) +{ + struct pollfd pfd; + int ret; + + pfd.fd = fd; + pfd.events = events; + + ret = poll(&pfd, 1, timeout); + return ret; +} + +static int do_rx(int fd) +{ + char rbuf; + int ret, num = 0; + + while (1) { + ret = recv(fd, &rbuf, 1, MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + break; + if (ret < 0) + return -1; + if (!ASSERT_EQ(rbuf, TEST_PACKET_PATTERN, "check pkt pattern")) + return -1; + num++; + } + + return num; +} + +static int run_test(const struct test_configuration *test, + int source_port_index) +{ + int fdt = -1, fdr = -1, len, tx = 0, rx = 0, err; + unsigned long tstop, tcur; + + fdr = setup_rx(test); + fdt = setup_tx(test); + if (!ASSERT_OK_FD(fdr, "setup rx") || !ASSERT_OK_FD(fdt, "setup tx")) { + err = -1; + goto out_close_sockets; + } + + len = build_packet(test, + (uint16_t)test->source_ports[source_port_index]); + if (!ASSERT_GT(len, 0, "build test packet")) + return -1; + + tcur = util_gettime(); + tstop = tcur; + + while (tx < TEST_PACKETS_COUNT) { + if (!ASSERT_OK(do_tx(fdt, buf, len), "do_tx")) + break; + tx++; + err = do_rx(fdr); + if (!ASSERT_GE(err, 0, "do_rx")) + break; + rx += err; + } + + /* read straggler packets, if any */ + if (rx < tx) { + tstop = util_gettime() + 100; + while (rx < 
tx) { + tcur = util_gettime(); + if (tcur >= tstop) + break; + + err = do_poll(fdr, POLLIN, tstop - tcur); + if (err < 0) + break; + err = do_rx(fdr); + if (err >= 0) + rx += err; + } + } + +out_close_sockets: + close(fdt); + close(fdr); + return rx; +} + +static int attach_and_configure_program(struct bpf_flow *skel) +{ + struct bpf_map *prog_array = skel->maps.jmp_table; + int main_prog_fd, sub_prog_fd, map_fd, i, err; + struct bpf_program *prog; + char prog_name[32]; + + main_prog_fd = bpf_program__fd(skel->progs._dissect); + if (main_prog_fd < 0) + return main_prog_fd; + + err = bpf_prog_attach(main_prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (err) + return err; + + map_fd = bpf_map__fd(prog_array); + if (map_fd < 0) + return map_fd; + + for (i = 0; i < bpf_map__max_entries(prog_array); i++) { + snprintf(prog_name, sizeof(prog_name), "flow_dissector_%d", i); + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!prog) + return -1; + + sub_prog_fd = bpf_program__fd(prog); + if (sub_prog_fd < 0) + return -1; + + err = bpf_map_update_elem(map_fd, &i, &sub_prog_fd, BPF_ANY); + if (err) + return -1; + } + + return main_prog_fd; +} + +static void detach_program(struct bpf_flow *skel, int prog_fd) +{ + bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); +} + +static int set_port_drop(int pf, bool multi_port) +{ + SYS(fail, "tc qdisc add dev lo ingress"); + SYS(fail_delete_qdisc, "tc filter add %s %s %s %s %s %s %s %s %s %s", + "dev lo", + "parent FFFF:", + "protocol", pf == PF_INET6 ? "ipv6" : "ip", + "pref 1337", + "flower", + "ip_proto udp", + "src_port", multi_port ? "8-10" : "9", + "action drop"); + return 0; + +fail_delete_qdisc: + SYS_NOFAIL("tc qdisc del dev lo ingress"); +fail: + return 1; +} + +static void remove_filter(void) +{ + SYS_NOFAIL("tc filter del dev lo ingress"); + SYS_NOFAIL("tc qdisc del dev lo ingress"); +} + +static int ipv4_setup(void) +{ + return set_port_drop(PF_INET, false); +} + +static int ipv6_setup(void) +{ + return set_port_drop(PF_INET6, false); +} + +static int port_range_setup(void) +{ + return set_port_drop(PF_INET, true); +} + +static int set_addresses(void) +{ + SYS(out, "ip -4 addr add %s dev lo", TEST_IPV4); + SYS(out_remove_ipv4, "ip -6 addr add %s dev lo", TEST_IPV6); + return 0; +out_remove_ipv4: + SYS_NOFAIL("ip -4 addr del %s dev lo", TEST_IPV4); +out: + return -1; +} + +static void unset_addresses(void) +{ + SYS_NOFAIL("ip -4 addr del %s dev lo", TEST_IPV4); + SYS_NOFAIL("ip -6 addr del %s dev lo", TEST_IPV6); +} + +static int ipip_setup(void) +{ + if (!ASSERT_OK(set_addresses(), "configure addresses")) + return -1; + if (!ASSERT_OK(set_port_drop(PF_INET, false), "set filter")) + goto out_unset_addresses; + SYS(out_remove_filter, + "ip link add ipip_test type ipip remote %s local %s dev lo", + TEST_TUNNEL_REMOTE, TEST_TUNNEL_LOCAL); + SYS(out_clean_netif, "ip link set ipip_test up"); + return 0; + +out_clean_netif: + SYS_NOFAIL("ip link del ipip_test"); +out_remove_filter: + remove_filter(); +out_unset_addresses: + unset_addresses(); + return -1; +} + +static void ipip_shutdown(void) +{ + SYS_NOFAIL("ip link del ipip_test"); + remove_filter(); + unset_addresses(); +} + +static int gre_setup(void) +{ + if (!ASSERT_OK(set_addresses(), "configure addresses")) + return -1; + if (!ASSERT_OK(set_port_drop(PF_INET, false), "set filter")) + goto out_unset_addresses; + SYS(out_remove_filter, + "ip link add gre_test type gre remote %s local %s dev lo", + TEST_TUNNEL_REMOTE, TEST_TUNNEL_LOCAL); + SYS(out_clean_netif, "ip link set gre_test 
up"); + return 0; + +out_clean_netif: + SYS_NOFAIL("ip link del ipip_test"); +out_remove_filter: + remove_filter(); +out_unset_addresses: + unset_addresses(); + return -1; +} + +static void gre_shutdown(void) +{ + SYS_NOFAIL("ip link del gre_test"); + remove_filter(); + unset_addresses(); +} + +static const struct test_configuration tests_input[] = { + { + .name = "ipv4", + .test_setup = ipv4_setup, + .test_teardown = remove_filter, + .source_ports = { 8, 9, 10 }, + .cfg_l3_inner = PF_INET, + .in_saddr4 = TEST_IN4_SRC_ADDR_DEFAULT, + .in_daddr4 = TEST_IN4_DST_ADDR_DEFAULT + }, + { + .name = "ipv4_continue_dissect", + .test_setup = ipv4_setup, + .test_teardown = remove_filter, + .source_ports = { 8, 9, 10 }, + .cfg_l3_inner = PF_INET, + .in_saddr4 = TEST_IN4_SRC_ADDR_DISSECT_CONTINUE, + .in_daddr4 = TEST_IN4_DST_ADDR_DEFAULT }, + { + .name = "ipip", + .test_setup = ipip_setup, + .test_teardown = ipip_shutdown, + .source_ports = { 8, 9, 10 }, + .cfg_l3_inner = PF_INET, + .in_saddr4 = TEST_IN4_SRC_ADDR_IPIP, + .in_daddr4 = TEST_IN4_DST_ADDR_IPIP, + .out_saddr4 = TEST_OUT4_SRC_ADDR_DEFAULT, + .out_daddr4 = TEST_OUT4_DST_ADDR_DEFAULT, + .cfg_l3_outer = PF_INET, + .cfg_encap_proto = IPPROTO_IPIP, + + }, + { + .name = "gre", + .test_setup = gre_setup, + .test_teardown = gre_shutdown, + .source_ports = { 8, 9, 10 }, + .cfg_l3_inner = PF_INET, + .in_saddr4 = TEST_IN4_SRC_ADDR_IPIP, + .in_daddr4 = TEST_IN4_DST_ADDR_IPIP, + .out_saddr4 = TEST_OUT4_SRC_ADDR_DEFAULT, + .out_daddr4 = TEST_OUT4_DST_ADDR_DEFAULT, + .cfg_l3_outer = PF_INET, + .cfg_encap_proto = IPPROTO_GRE, + }, + { + .name = "port_range", + .test_setup = port_range_setup, + .test_teardown = remove_filter, + .source_ports = { 7, 9, 11 }, + .cfg_l3_inner = PF_INET, + .in_saddr4 = TEST_IN4_SRC_ADDR_DEFAULT, + .in_daddr4 = TEST_IN4_DST_ADDR_DEFAULT }, + { + .name = "ipv6", + .test_setup = ipv6_setup, + .test_teardown = remove_filter, + .source_ports = { 8, 9, 10 }, + .cfg_l3_inner = PF_INET6, + .in_saddr6 = TEST_IN6_SRC_ADDR_DEFAULT, + .in_daddr6 = TEST_IN6_DST_ADDR_DEFAULT + }, +}; + +struct test_ctx { + struct bpf_flow *skel; + struct netns_obj *ns; + int prog_fd; +}; + +static int test_global_init(struct test_ctx *ctx) +{ + int err; + + ctx->skel = bpf_flow__open_and_load(); + if (!ASSERT_OK_PTR(ctx->skel, "open and load flow_dissector")) + return -1; + + ctx->ns = netns_new("flow_dissector_classification", true); + if (!ASSERT_OK_PTR(ctx->ns, "switch ns")) + goto out_destroy_skel; + + err = write_sysctl("/proc/sys/net/ipv4/conf/default/rp_filter", "0"); + err |= write_sysctl("/proc/sys/net/ipv4/conf/all/rp_filter", "0"); + err |= write_sysctl("/proc/sys/net/ipv4/conf/lo/rp_filter", "0"); + if (!ASSERT_OK(err, "configure net tunables")) + goto out_clean_ns; + + ctx->prog_fd = attach_and_configure_program(ctx->skel); + if (!ASSERT_OK_FD(ctx->prog_fd, "attach and configure program")) + goto out_clean_ns; + return 0; +out_clean_ns: + netns_free(ctx->ns); +out_destroy_skel: + bpf_flow__destroy(ctx->skel); + return -1; +} + +static void test_global_shutdown(struct test_ctx *ctx) +{ + detach_program(ctx->skel, ctx->prog_fd); + netns_free(ctx->ns); + bpf_flow__destroy(ctx->skel); +} + +void test_flow_dissector_classification(void) +{ + struct test_ctx ctx; + const struct test_configuration *test; + int i; + + if (test_global_init(&ctx)) + return; + + for (i = 0; i < ARRAY_SIZE(tests_input); i++) { + if (!test__start_subtest(tests_input[i].name)) + continue; + test = &tests_input[i]; + /* All tests are expected to have one rx-ok port first, + * 
then a non-working rx port, and finally a rx-ok port + */ + if (test->test_setup && + !ASSERT_OK(test->test_setup(), "init filter")) + continue; + + ASSERT_EQ(run_test(test, 0), TEST_PACKETS_COUNT, + "test first port"); + ASSERT_EQ(run_test(test, 1), 0, "test second port"); + ASSERT_EQ(run_test(test, 2), TEST_PACKETS_COUNT, + "test third port"); + if (test->test_teardown) + test->test_teardown(); + } + test_global_shutdown(&ctx); +} -- cgit v1.2.3 From 63b37657c5fdd0625f5825d5f90e7bec1dfafb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Wed, 20 Nov 2024 08:43:24 +0100 Subject: selftests/bpf: remove test_flow_dissector.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that test_flow_dissector.sh has been converted to test_progs, remove the legacy test. Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20241120-flow_dissector-v3-14-45b46494f937@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - tools/testing/selftests/bpf/test_flow_dissector.c | 780 --------------------- tools/testing/selftests/bpf/test_flow_dissector.sh | 178 ----- 4 files changed, 961 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c delete mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c2a1842c3d8b..5ad1b9f5e8e6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ feature urandom_read test_sockmap test_lirc_mode2_user -test_flow_dissector flow_dissector_load test_tcpnotify_user test_libbpf diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ad3b1ba1920..a1964d40a60e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -133,7 +133,6 @@ TEST_PROGS := test_kmod.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ - test_flow_dissector.sh \ test_xdp_vlan_mode_generic.sh \ test_xdp_vlan_mode_native.sh \ test_lwt_ip_encap.sh \ @@ -161,7 +160,6 @@ TEST_GEN_PROGS_EXTENDED = \ flow_dissector_load \ runqslower \ test_cpp \ - test_flow_dissector \ test_lirc_mode2_user \ veristat \ xdp_features \ diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c deleted file mode 100644 index 571cc076dd7d..000000000000 --- a/tools/testing/selftests/bpf/test_flow_dissector.c +++ /dev/null @@ -1,780 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Inject packets with all sorts of encapsulation into the kernel. - * - * IPv4/IPv6 outer layer 3 - * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/.. 
- * IPv4/IPv6 inner layer 3 - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CFG_PORT_INNER 8000 - -/* Add some protocol definitions that do not exist in userspace */ - -struct grehdr { - uint16_t unused; - uint16_t protocol; -} __attribute__((packed)); - -struct guehdr { - union { - struct { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 hlen:5, - control:1, - version:2; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 version:2, - control:1, - hlen:5; -#else -#error "Please fix " -#endif - __u8 proto_ctype; - __be16 flags; - }; - __be32 word; - }; -}; - -static uint8_t cfg_dsfield_inner; -static uint8_t cfg_dsfield_outer; -static uint8_t cfg_encap_proto; -static bool cfg_expect_failure = false; -static int cfg_l3_extra = AF_UNSPEC; /* optional SIT prefix */ -static int cfg_l3_inner = AF_UNSPEC; -static int cfg_l3_outer = AF_UNSPEC; -static int cfg_num_pkt = 10; -static int cfg_num_secs = 0; -static char cfg_payload_char = 'a'; -static int cfg_payload_len = 100; -static int cfg_port_gue = 6080; -static bool cfg_only_rx; -static bool cfg_only_tx; -static int cfg_src_port = 9; - -static char buf[ETH_DATA_LEN]; - -#define INIT_ADDR4(name, addr4, port) \ - static struct sockaddr_in name = { \ - .sin_family = AF_INET, \ - .sin_port = __constant_htons(port), \ - .sin_addr.s_addr = __constant_htonl(addr4), \ - }; - -#define INIT_ADDR6(name, addr6, port) \ - static struct sockaddr_in6 name = { \ - .sin6_family = AF_INET6, \ - .sin6_port = __constant_htons(port), \ - .sin6_addr = addr6, \ - }; - -INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER) -INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0) -INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0) -INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0) -INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0) -INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0) - -INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER) -INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0) -INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0) -INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0) -INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0) -INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0) - -static unsigned long util_gettime(void) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); -} - -static void util_printaddr(const char *msg, struct sockaddr *addr) -{ - unsigned long off = 0; - char nbuf[INET6_ADDRSTRLEN]; - - switch (addr->sa_family) { - case PF_INET: - off = __builtin_offsetof(struct sockaddr_in, sin_addr); - break; - case PF_INET6: - off = __builtin_offsetof(struct sockaddr_in6, sin6_addr); - break; - default: - error(1, 0, "printaddr: unsupported family %u\n", - addr->sa_family); - } - - if (!inet_ntop(addr->sa_family, ((void *) addr) + off, nbuf, - sizeof(nbuf))) - error(1, errno, "inet_ntop"); - - fprintf(stderr, "%s: %s\n", msg, nbuf); -} - -static unsigned long add_csum_hword(const uint16_t *start, int num_u16) -{ - unsigned long sum = 0; - int i; - - for (i = 0; i < num_u16; i++) - sum += start[i]; - - return sum; -} - -static uint16_t build_ip_csum(const uint16_t *start, int num_u16, - unsigned long sum) -{ - sum += add_csum_hword(start, num_u16); - - while (sum >> 16) - sum = (sum & 0xffff) + (sum >> 16); - - return ~sum; -} - -static void build_ipv4_header(void *header, uint8_t proto, - uint32_t src, uint32_t dst, - int 
payload_len, uint8_t tos) -{ - struct iphdr *iph = header; - - iph->ihl = 5; - iph->version = 4; - iph->tos = tos; - iph->ttl = 8; - iph->tot_len = htons(sizeof(*iph) + payload_len); - iph->id = htons(1337); - iph->protocol = proto; - iph->saddr = src; - iph->daddr = dst; - iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0); -} - -static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) -{ - uint16_t val, *ptr = (uint16_t *)ip6h; - - val = ntohs(*ptr); - val &= 0xF00F; - val |= ((uint16_t) dsfield) << 4; - *ptr = htons(val); -} - -static void build_ipv6_header(void *header, uint8_t proto, - struct sockaddr_in6 *src, - struct sockaddr_in6 *dst, - int payload_len, uint8_t dsfield) -{ - struct ipv6hdr *ip6h = header; - - ip6h->version = 6; - ip6h->payload_len = htons(payload_len); - ip6h->nexthdr = proto; - ip6h->hop_limit = 8; - ipv6_set_dsfield(ip6h, dsfield); - - memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr)); - memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr)); -} - -static uint16_t build_udp_v4_csum(const struct iphdr *iph, - const struct udphdr *udph, - int num_words) -{ - unsigned long pseudo_sum; - int num_u16 = sizeof(iph->saddr); /* halfwords: twice byte len */ - - pseudo_sum = add_csum_hword((void *) &iph->saddr, num_u16); - pseudo_sum += htons(IPPROTO_UDP); - pseudo_sum += udph->len; - return build_ip_csum((void *) udph, num_words, pseudo_sum); -} - -static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h, - const struct udphdr *udph, - int num_words) -{ - unsigned long pseudo_sum; - int num_u16 = sizeof(ip6h->saddr); /* halfwords: twice byte len */ - - pseudo_sum = add_csum_hword((void *) &ip6h->saddr, num_u16); - pseudo_sum += htons(ip6h->nexthdr); - pseudo_sum += ip6h->payload_len; - return build_ip_csum((void *) udph, num_words, pseudo_sum); -} - -static void build_udp_header(void *header, int payload_len, - uint16_t dport, int family) -{ - struct udphdr *udph = header; - int len = sizeof(*udph) + payload_len; - - udph->source = htons(cfg_src_port); - udph->dest = htons(dport); - udph->len = htons(len); - udph->check = 0; - if (family == AF_INET) - udph->check = build_udp_v4_csum(header - sizeof(struct iphdr), - udph, len >> 1); - else - udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr), - udph, len >> 1); -} - -static void build_gue_header(void *header, uint8_t proto) -{ - struct guehdr *gueh = header; - - gueh->proto_ctype = proto; -} - -static void build_gre_header(void *header, uint16_t proto) -{ - struct grehdr *greh = header; - - greh->protocol = htons(proto); -} - -static int l3_length(int family) -{ - if (family == AF_INET) - return sizeof(struct iphdr); - else - return sizeof(struct ipv6hdr); -} - -static int build_packet(void) -{ - int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0; - int el3_len = 0; - - if (cfg_l3_extra) - el3_len = l3_length(cfg_l3_extra); - - /* calculate header offsets */ - if (cfg_encap_proto) { - ol3_len = l3_length(cfg_l3_outer); - - if (cfg_encap_proto == IPPROTO_GRE) - ol4_len = sizeof(struct grehdr); - else if (cfg_encap_proto == IPPROTO_UDP) - ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr); - } - - il3_len = l3_length(cfg_l3_inner); - il4_len = sizeof(struct udphdr); - - if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >= - sizeof(buf)) - error(1, 0, "packet too large\n"); - - /* - * Fill packet from inside out, to calculate correct checksums. - * But create ip before udp headers, as udp uses ip for pseudo-sum. 
- */ - memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len, - cfg_payload_char, cfg_payload_len); - - /* add zero byte for udp csum padding */ - buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0; - - switch (cfg_l3_inner) { - case PF_INET: - build_ipv4_header(buf + el3_len + ol3_len + ol4_len, - IPPROTO_UDP, - in_saddr4.sin_addr.s_addr, - in_daddr4.sin_addr.s_addr, - il4_len + cfg_payload_len, - cfg_dsfield_inner); - break; - case PF_INET6: - build_ipv6_header(buf + el3_len + ol3_len + ol4_len, - IPPROTO_UDP, - &in_saddr6, &in_daddr6, - il4_len + cfg_payload_len, - cfg_dsfield_inner); - break; - } - - build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len, - cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner); - - if (!cfg_encap_proto) - return il3_len + il4_len + cfg_payload_len; - - switch (cfg_l3_outer) { - case PF_INET: - build_ipv4_header(buf + el3_len, cfg_encap_proto, - out_saddr4.sin_addr.s_addr, - out_daddr4.sin_addr.s_addr, - ol4_len + il3_len + il4_len + cfg_payload_len, - cfg_dsfield_outer); - break; - case PF_INET6: - build_ipv6_header(buf + el3_len, cfg_encap_proto, - &out_saddr6, &out_daddr6, - ol4_len + il3_len + il4_len + cfg_payload_len, - cfg_dsfield_outer); - break; - } - - switch (cfg_encap_proto) { - case IPPROTO_UDP: - build_gue_header(buf + el3_len + ol3_len + ol4_len - - sizeof(struct guehdr), - cfg_l3_inner == PF_INET ? IPPROTO_IPIP - : IPPROTO_IPV6); - build_udp_header(buf + el3_len + ol3_len, - sizeof(struct guehdr) + il3_len + il4_len + - cfg_payload_len, - cfg_port_gue, cfg_l3_outer); - break; - case IPPROTO_GRE: - build_gre_header(buf + el3_len + ol3_len, - cfg_l3_inner == PF_INET ? ETH_P_IP - : ETH_P_IPV6); - break; - } - - switch (cfg_l3_extra) { - case PF_INET: - build_ipv4_header(buf, - cfg_l3_outer == PF_INET ? IPPROTO_IPIP - : IPPROTO_IPV6, - extra_saddr4.sin_addr.s_addr, - extra_daddr4.sin_addr.s_addr, - ol3_len + ol4_len + il3_len + il4_len + - cfg_payload_len, 0); - break; - case PF_INET6: - build_ipv6_header(buf, - cfg_l3_outer == PF_INET ? 
IPPROTO_IPIP - : IPPROTO_IPV6, - &extra_saddr6, &extra_daddr6, - ol3_len + ol4_len + il3_len + il4_len + - cfg_payload_len, 0); - break; - } - - return el3_len + ol3_len + ol4_len + il3_len + il4_len + - cfg_payload_len; -} - -/* sender transmits encapsulated over RAW or unencap'd over UDP */ -static int setup_tx(void) -{ - int family, fd, ret; - - if (cfg_l3_extra) - family = cfg_l3_extra; - else if (cfg_l3_outer) - family = cfg_l3_outer; - else - family = cfg_l3_inner; - - fd = socket(family, SOCK_RAW, IPPROTO_RAW); - if (fd == -1) - error(1, errno, "socket tx"); - - if (cfg_l3_extra) { - if (cfg_l3_extra == PF_INET) - ret = connect(fd, (void *) &extra_daddr4, - sizeof(extra_daddr4)); - else - ret = connect(fd, (void *) &extra_daddr6, - sizeof(extra_daddr6)); - if (ret) - error(1, errno, "connect tx"); - } else if (cfg_l3_outer) { - /* connect to destination if not encapsulated */ - if (cfg_l3_outer == PF_INET) - ret = connect(fd, (void *) &out_daddr4, - sizeof(out_daddr4)); - else - ret = connect(fd, (void *) &out_daddr6, - sizeof(out_daddr6)); - if (ret) - error(1, errno, "connect tx"); - } else { - /* otherwise using loopback */ - if (cfg_l3_inner == PF_INET) - ret = connect(fd, (void *) &in_daddr4, - sizeof(in_daddr4)); - else - ret = connect(fd, (void *) &in_daddr6, - sizeof(in_daddr6)); - if (ret) - error(1, errno, "connect tx"); - } - - return fd; -} - -/* receiver reads unencapsulated UDP */ -static int setup_rx(void) -{ - int fd, ret; - - fd = socket(cfg_l3_inner, SOCK_DGRAM, 0); - if (fd == -1) - error(1, errno, "socket rx"); - - if (cfg_l3_inner == PF_INET) - ret = bind(fd, (void *) &in_daddr4, sizeof(in_daddr4)); - else - ret = bind(fd, (void *) &in_daddr6, sizeof(in_daddr6)); - if (ret) - error(1, errno, "bind rx"); - - return fd; -} - -static int do_tx(int fd, const char *pkt, int len) -{ - int ret; - - ret = write(fd, pkt, len); - if (ret == -1) - error(1, errno, "send"); - if (ret != len) - error(1, errno, "send: len (%d < %d)\n", ret, len); - - return 1; -} - -static int do_poll(int fd, short events, int timeout) -{ - struct pollfd pfd; - int ret; - - pfd.fd = fd; - pfd.events = events; - - ret = poll(&pfd, 1, timeout); - if (ret == -1) - error(1, errno, "poll"); - if (ret && !(pfd.revents & POLLIN)) - error(1, errno, "poll: unexpected event 0x%x\n", pfd.revents); - - return ret; -} - -static int do_rx(int fd) -{ - char rbuf; - int ret, num = 0; - - while (1) { - ret = recv(fd, &rbuf, 1, MSG_DONTWAIT); - if (ret == -1 && errno == EAGAIN) - break; - if (ret == -1) - error(1, errno, "recv"); - if (rbuf != cfg_payload_char) - error(1, 0, "recv: payload mismatch"); - num++; - } - - return num; -} - -static int do_main(void) -{ - unsigned long tstop, treport, tcur; - int fdt = -1, fdr = -1, len, tx = 0, rx = 0; - - if (!cfg_only_tx) - fdr = setup_rx(); - if (!cfg_only_rx) - fdt = setup_tx(); - - len = build_packet(); - - tcur = util_gettime(); - treport = tcur + 1000; - tstop = tcur + (cfg_num_secs * 1000); - - while (1) { - if (!cfg_only_rx) - tx += do_tx(fdt, buf, len); - - if (!cfg_only_tx) - rx += do_rx(fdr); - - if (cfg_num_secs) { - tcur = util_gettime(); - if (tcur >= tstop) - break; - if (tcur >= treport) { - fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx); - tx = 0; - rx = 0; - treport = tcur + 1000; - } - } else { - if (tx == cfg_num_pkt) - break; - } - } - - /* read straggler packets, if any */ - if (rx < tx) { - tstop = util_gettime() + 100; - while (rx < tx) { - tcur = util_gettime(); - if (tcur >= tstop) - break; - - do_poll(fdr, POLLIN, tstop - tcur); - rx += 
do_rx(fdr); - } - } - - fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx); - - if (fdr != -1 && close(fdr)) - error(1, errno, "close rx"); - if (fdt != -1 && close(fdt)) - error(1, errno, "close tx"); - - /* - * success (== 0) only if received all packets - * unless failure is expected, in which case none must arrive. - */ - if (cfg_expect_failure) - return rx != 0; - else - return rx != tx; -} - - -static void __attribute__((noreturn)) usage(const char *filepath) -{ - fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] " - "[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] " - "[-s [-d ] [-S ] [-D ] " - "[-x ] [-X ] [-f ] [-F]\n", - filepath); - exit(1); -} - -static void parse_addr(int family, void *addr, const char *optarg) -{ - int ret; - - ret = inet_pton(family, optarg, addr); - if (ret == -1) - error(1, errno, "inet_pton"); - if (ret == 0) - error(1, 0, "inet_pton: bad string"); -} - -static void parse_addr4(struct sockaddr_in *addr, const char *optarg) -{ - parse_addr(AF_INET, &addr->sin_addr, optarg); -} - -static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg) -{ - parse_addr(AF_INET6, &addr->sin6_addr, optarg); -} - -static int parse_protocol_family(const char *filepath, const char *optarg) -{ - if (!strcmp(optarg, "4")) - return PF_INET; - if (!strcmp(optarg, "6")) - return PF_INET6; - - usage(filepath); -} - -static void parse_opts(int argc, char **argv) -{ - int c; - - while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) { - switch (c) { - case 'd': - if (cfg_l3_outer == AF_UNSPEC) - error(1, 0, "-d must be preceded by -o"); - if (cfg_l3_outer == AF_INET) - parse_addr4(&out_daddr4, optarg); - else - parse_addr6(&out_daddr6, optarg); - break; - case 'D': - if (cfg_l3_inner == AF_UNSPEC) - error(1, 0, "-D must be preceded by -i"); - if (cfg_l3_inner == AF_INET) - parse_addr4(&in_daddr4, optarg); - else - parse_addr6(&in_daddr6, optarg); - break; - case 'e': - if (!strcmp(optarg, "gre")) - cfg_encap_proto = IPPROTO_GRE; - else if (!strcmp(optarg, "gue")) - cfg_encap_proto = IPPROTO_UDP; - else if (!strcmp(optarg, "bare")) - cfg_encap_proto = IPPROTO_IPIP; - else if (!strcmp(optarg, "none")) - cfg_encap_proto = IPPROTO_IP; /* == 0 */ - else - usage(argv[0]); - break; - case 'f': - cfg_src_port = strtol(optarg, NULL, 0); - break; - case 'F': - cfg_expect_failure = true; - break; - case 'h': - usage(argv[0]); - break; - case 'i': - if (!strcmp(optarg, "4")) - cfg_l3_inner = PF_INET; - else if (!strcmp(optarg, "6")) - cfg_l3_inner = PF_INET6; - else - usage(argv[0]); - break; - case 'l': - cfg_payload_len = strtol(optarg, NULL, 0); - break; - case 'n': - cfg_num_pkt = strtol(optarg, NULL, 0); - break; - case 'o': - cfg_l3_outer = parse_protocol_family(argv[0], optarg); - break; - case 'O': - cfg_l3_extra = parse_protocol_family(argv[0], optarg); - break; - case 'R': - cfg_only_rx = true; - break; - case 's': - if (cfg_l3_outer == AF_INET) - parse_addr4(&out_saddr4, optarg); - else - parse_addr6(&out_saddr6, optarg); - break; - case 'S': - if (cfg_l3_inner == AF_INET) - parse_addr4(&in_saddr4, optarg); - else - parse_addr6(&in_saddr6, optarg); - break; - case 't': - cfg_num_secs = strtol(optarg, NULL, 0); - break; - case 'T': - cfg_only_tx = true; - break; - case 'x': - cfg_dsfield_outer = strtol(optarg, NULL, 0); - break; - case 'X': - cfg_dsfield_inner = strtol(optarg, NULL, 0); - break; - } - } - - if (cfg_only_rx && cfg_only_tx) - error(1, 0, "options: cannot combine rx-only and tx-only"); - - if (cfg_encap_proto && cfg_l3_outer == 
AF_UNSPEC) - error(1, 0, "options: must specify outer with encap"); - else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC) - error(1, 0, "options: cannot combine no-encap and outer"); - else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC) - error(1, 0, "options: cannot combine no-encap and extra"); - - if (cfg_l3_inner == AF_UNSPEC) - cfg_l3_inner = AF_INET6; - if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP) - cfg_encap_proto = IPPROTO_IPV6; - - /* RFC 6040 4.2: - * on decap, if outer encountered congestion (CE == 0x3), - * but inner cannot encode ECN (NoECT == 0x0), then drop packet. - */ - if (((cfg_dsfield_outer & 0x3) == 0x3) && - ((cfg_dsfield_inner & 0x3) == 0x0)) - cfg_expect_failure = true; -} - -static void print_opts(void) -{ - if (cfg_l3_inner == PF_INET6) { - util_printaddr("inner.dest6", (void *) &in_daddr6); - util_printaddr("inner.source6", (void *) &in_saddr6); - } else { - util_printaddr("inner.dest4", (void *) &in_daddr4); - util_printaddr("inner.source4", (void *) &in_saddr4); - } - - if (!cfg_l3_outer) - return; - - fprintf(stderr, "encap proto: %u\n", cfg_encap_proto); - - if (cfg_l3_outer == PF_INET6) { - util_printaddr("outer.dest6", (void *) &out_daddr6); - util_printaddr("outer.source6", (void *) &out_saddr6); - } else { - util_printaddr("outer.dest4", (void *) &out_daddr4); - util_printaddr("outer.source4", (void *) &out_saddr4); - } - - if (!cfg_l3_extra) - return; - - if (cfg_l3_outer == PF_INET6) { - util_printaddr("extra.dest6", (void *) &extra_daddr6); - util_printaddr("extra.source6", (void *) &extra_saddr6); - } else { - util_printaddr("extra.dest4", (void *) &extra_daddr4); - util_printaddr("extra.source4", (void *) &extra_saddr4); - } - -} - -int main(int argc, char **argv) -{ - parse_opts(argc, argv); - print_opts(); - return do_main(); -} diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh deleted file mode 100755 index 4b298863797a..000000000000 --- a/tools/testing/selftests/bpf/test_flow_dissector.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Load BPF flow dissector and verify it correctly dissects traffic - -BPF_FILE="bpf_flow.bpf.o" -export TESTNAME=test_flow_dissector -unmount=0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -msg="skip all tests:" -if [ $UID != 0 ]; then - echo $msg please run this as root >&2 - exit $ksft_skip -fi - -# This test needs to be run in a network namespace with in_netns.sh. Check if -# this is the case and run it with in_netns.sh if it is being run in the root -# namespace. -if [[ -z $(ip netns identify $$) ]]; then - err=0 - if bpftool="$(which bpftool)"; then - echo "Testing global flow dissector..." - - $bpftool prog loadall $BPF_FILE /sys/fs/bpf/flow \ - type flow_dissector - - if ! unshare --net $bpftool prog attach pinned \ - /sys/fs/bpf/flow/_dissect flow_dissector; then - echo "Unexpected unsuccessful attach in namespace" >&2 - err=1 - fi - - $bpftool prog attach pinned /sys/fs/bpf/flow/_dissect \ - flow_dissector - - if unshare --net $bpftool prog attach pinned \ - /sys/fs/bpf/flow/_dissect flow_dissector; then - echo "Unexpected successful attach in namespace" >&2 - err=1 - fi - - if ! 
$bpftool prog detach pinned \ - /sys/fs/bpf/flow/_dissect flow_dissector; then - echo "Failed to detach flow dissector" >&2 - err=1 - fi - - rm -rf /sys/fs/bpf/flow - else - echo "Skipping root flow dissector test, bpftool not found" >&2 - fi - - # Run the rest of the tests in a net namespace. - ../net/in_netns.sh "$0" "$@" - err=$(( $err + $? )) - - if (( $err == 0 )); then - echo "selftests: $TESTNAME [PASS]"; - else - echo "selftests: $TESTNAME [FAILED]"; - fi - - exit $err -fi - -# Determine selftest success via shell exit code -exit_handler() -{ - set +e - - # Cleanup - tc filter del dev lo ingress pref 1337 2> /dev/null - tc qdisc del dev lo ingress 2> /dev/null - ./flow_dissector_load -d 2> /dev/null - if [ $unmount -ne 0 ]; then - umount bpffs 2> /dev/null - fi -} - -# Exit script immediately (well catched by trap handler) if any -# program/thing exits with a non-zero status. -set -e - -# (Use 'trap -l' to list meaning of numbers) -trap exit_handler 0 2 3 6 9 - -# Mount BPF file system -if /bin/mount | grep /sys/fs/bpf > /dev/null; then - echo "bpffs already mounted" -else - echo "bpffs not mounted. Mounting..." - unmount=1 - /bin/mount bpffs /sys/fs/bpf -t bpf -fi - -# Attach BPF program -./flow_dissector_load -p $BPF_FILE -s _dissect - -# Setup -tc qdisc add dev lo ingress -echo 0 > /proc/sys/net/ipv4/conf/default/rp_filter -echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter -echo 0 > /proc/sys/net/ipv4/conf/lo/rp_filter - -echo "Testing IPv4..." -# Drops all IP/UDP packets coming from port 9 -tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \ - udp src_port 9 action drop - -# Send 10 IPv4/UDP packets from port 8. Filter should not drop any. -./test_flow_dissector -i 4 -f 8 -# Send 10 IPv4/UDP packets from port 9. Filter should drop all. -./test_flow_dissector -i 4 -f 9 -F -# Send 10 IPv4/UDP packets from port 10. Filter should not drop any. -./test_flow_dissector -i 4 -f 10 - -echo "Testing IPv4 from 127.0.0.127 (fallback to generic dissector)..." -# Send 10 IPv4/UDP packets from port 8. Filter should not drop any. -./test_flow_dissector -i 4 -S 127.0.0.127 -f 8 -# Send 10 IPv4/UDP packets from port 9. Filter should drop all. -./test_flow_dissector -i 4 -S 127.0.0.127 -f 9 -F -# Send 10 IPv4/UDP packets from port 10. Filter should not drop any. -./test_flow_dissector -i 4 -S 127.0.0.127 -f 10 - -echo "Testing IPIP..." -# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any. -./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 8 -# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all. -./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 9 -F -# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any. -./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 10 - -echo "Testing IPv4 + GRE..." -# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any. -./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 8 -# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all. -./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 9 -F -# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any. 
-./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ - -D 192.168.0.1 -S 1.1.1.1 -f 10 - -tc filter del dev lo ingress pref 1337 - -echo "Testing port range..." -# Drops all IP/UDP packets coming from port 8-10 -tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \ - udp src_port 8-10 action drop - -# Send 10 IPv4/UDP packets from port 7. Filter should not drop any. -./test_flow_dissector -i 4 -f 7 -# Send 10 IPv4/UDP packets from port 9. Filter should drop all. -./test_flow_dissector -i 4 -f 9 -F -# Send 10 IPv4/UDP packets from port 11. Filter should not drop any. -./test_flow_dissector -i 4 -f 11 - -tc filter del dev lo ingress pref 1337 - -echo "Testing IPv6..." -# Drops all IPv6/UDP packets coming from port 9 -tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \ - udp src_port 9 action drop - -# Send 10 IPv6/UDP packets from port 8. Filter should not drop any. -./test_flow_dissector -i 6 -f 8 -# Send 10 IPv6/UDP packets from port 9. Filter should drop all. -./test_flow_dissector -i 6 -f 9 -F -# Send 10 IPv6/UDP packets from port 10. Filter should not drop any. -./test_flow_dissector -i 6 -f 10 - -exit 0 -- cgit v1.2.3 From 98ebe5ef6f5c4517ba92fb3e56f95827ebea83fd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 21 Nov 2024 14:45:58 -0800 Subject: libbpf: don't adjust USDT semaphore address if .stapsdt.base addr is missing USDT ELF note optionally can record an offset of .stapsdt.base, which is used to make adjustments to USDT target attach address. Currently, libbpf will do this address adjustment unconditionally if it finds .stapsdt.base ELF section in target binary. But there is a corner case where .stapsdt.base ELF section is present, but specific USDT note doesn't reference it. In such case, libbpf will basically just add base address and end up with absolutely incorrect USDT target address. This adjustment has to be done only if both .stapsdt.sema section is present and USDT note is recording a reference to it. Fixes: 74cc6311cec9 ("libbpf: Add USDT notes parsing and resolution logic") Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20241121224558.796110-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/usdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 5f085736c6c4..4e4a52742b01 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -661,7 +661,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation */ usdt_abs_ip = note.loc_addr; - if (base_addr) + if (base_addr && note.base_addr) usdt_abs_ip += base_addr - note.base_addr; /* When attaching uprobes (which is what USDTs basically are) -- cgit v1.2.3 From 9aef3aaa7059c4dd0cc875107e05bb3198a7fc33 Mon Sep 17 00:00:00 2001 From: Mahe Tardy Date: Mon, 25 Nov 2024 15:26:03 +0000 Subject: selftests/bpf: add cgroup skb direct packet access test This verifies that programs of BPF_PROG_TYPE_CGROUP_SKB can access skb->data_end with direct packet access when being run with BPF_PROG_TEST_RUN. 
Signed-off-by: Mahe Tardy Link: https://lore.kernel.org/r/20241125152603.375898-2-mahe.tardy@gmail.com Signed-off-by: Alexei Starovoitov --- .../prog_tests/cgroup_skb_direct_packet_access.c | 28 ++++++++++++++++++++++ .../bpf/progs/cgroup_skb_direct_packet_access.c | 15 ++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_skb_direct_packet_access.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_skb_direct_packet_access.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_direct_packet_access.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_direct_packet_access.c new file mode 100644 index 000000000000..e1a90c10db8c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_direct_packet_access.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_skb_direct_packet_access.skel.h" + +void test_cgroup_skb_prog_run_direct_packet_access(void) +{ + int err; + struct cgroup_skb_direct_packet_access *skel; + char test_skb[64] = {}; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = test_skb, + .data_size_in = sizeof(test_skb), + ); + + skel = cgroup_skb_direct_packet_access__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_skb_direct_packet_access__open_and_load")) + return; + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.direct_packet_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts err"); + ASSERT_EQ(topts.retval, 1, "retval"); + + ASSERT_NEQ(skel->bss->data_end, 0, "data_end"); + + cgroup_skb_direct_packet_access__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_direct_packet_access.c b/tools/testing/selftests/bpf/progs/cgroup_skb_direct_packet_access.c new file mode 100644 index 000000000000..e32b07d802bb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_skb_direct_packet_access.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +__u32 data_end; + +SEC("cgroup_skb/ingress") +int direct_packet_access(struct __sk_buff *skb) +{ + data_end = skb->data_end; + return 1; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 9a17db586d722c0875f7a0580c438dc80afee1e7 Mon Sep 17 00:00:00 2001 From: Ben Olson Date: Tue, 26 Nov 2024 14:08:45 -0600 Subject: libbpf: Improve debug message when the base BTF cannot be found When running `bpftool` on a kernel module installed in `/lib/modules...`, this error is encountered if the user does not specify `--base-btf` to point to a valid base BTF (e.g. usually in `/sys/kernel/btf/vmlinux`). However, looking at the debug output to determine the cause of the error simply says `Invalid BTF string section`, which does not point to the actual source of the error. This just improves that debug message to tell users what happened. 
Signed-off-by: Ben Olson Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/Z0YqzQ5lNz7obQG7@bolson-desk Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 12468ae0d573..a4ae2df68b91 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -283,7 +283,7 @@ static int btf_parse_str_sec(struct btf *btf) return -EINVAL; } if (!btf->base_btf && start[0]) { - pr_debug("Invalid BTF string section\n"); + pr_debug("Malformed BTF string section, did you forget to provide base BTF?\n"); return -EINVAL; } return 0; -- cgit v1.2.3 From c721d8f8b196285a59ed5c940e856bce9890523f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Thu, 28 Nov 2024 15:38:43 +0100 Subject: selftests/bpf: ensure proper root namespace cleanup when test fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit serial_test_flow_dissector_namespace manipulates both the root net namespace and a dedicated non-root net namespace. If for some reason a program attach on root namespace succeeds while it was expected to fail, the unexpected program will remain attached to the root namespace, possibly affecting other runs or even other tests in the same run. Fix undesired test failure side effect by explicitly detaching programs on failing tests expecting attach to fail. As a side effect of this change, do not test errno value if the tested operation do not fail. Fixes: 284ed00a59dd ("selftests/bpf: migrate flow_dissector namespace exclusivity test") Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241128-small_flow_test_fix-v1-1-c12d45c98c59@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 8e6e483fead3..08bae13248c4 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -525,11 +525,14 @@ void serial_test_flow_dissector_namespace(void) ns = open_netns(TEST_NS); if (!ASSERT_OK_PTR(ns, "enter non-root net namespace")) goto out_clean_ns; - err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (!ASSERT_ERR(err, + "refuse new flow dissector in non-root net namespace")) + bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); + else + ASSERT_EQ(errno, EEXIST, + "refused because of already attached prog"); close_netns(ns); - ASSERT_ERR(err, "refuse new flow dissector in non-root net namespace"); - ASSERT_EQ(errno, EEXIST, "refused because of already attached prog"); /* If no flow dissector is attached to the root namespace, we must * be able to attach one to a non-root net namespace @@ -545,8 +548,11 @@ void serial_test_flow_dissector_namespace(void) * a flow dissector to root namespace must fail */ err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - ASSERT_ERR(err, "refuse new flow dissector on root namespace"); - ASSERT_EQ(errno, EEXIST, "refused because of already attached prog"); + if (!ASSERT_ERR(err, "refuse new flow dissector on root namespace")) + bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); + else + ASSERT_EQ(errno, EEXIST, + "refused because of already attached 
prog"); ns = open_netns(TEST_NS); bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); -- cgit v1.2.3 From 793baff3f24f16dab9061045e23eea67724feae6 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Fri, 29 Nov 2024 17:10:03 +0800 Subject: sched_ext: Add __weak to fix the build errors commit 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") introduced several new functions which caused compilation errors when compiled with clang. Let's fix this by adding __weak markers. Signed-off-by: Honglei Wang Signed-off-by: Tejun Heo Fixes: 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") Acked-by: Andrii Nakryiko --- tools/sched_ext/include/scx/common.bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 2f36b7b6418d..625f5b046776 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -40,9 +40,9 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; -- cgit v1.2.3 From e8a99af68c068865dbac7f3330e97bf8e96edf33 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:44:17 +0800 Subject: tools/power turbostat: Add initial support for PantherLake Add initial support for PantherLake. It shares the same features with Lunarlake. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 58a487c225a7..540336138ce9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1024,6 +1024,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_ARROWLAKE_U, &adl_features }, { INTEL_ARROWLAKE, &adl_features }, { INTEL_LUNARLAKE_M, &lnl_features }, + { INTEL_PANTHERLAKE_L, &lnl_features }, { INTEL_ATOM_SILVERMONT, &slv_features }, { INTEL_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_ATOM_AIRMONT, &amt_features }, -- cgit v1.2.3 From 6b47ed23e2f1bc2c177da47437970e6208ac9ea0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:44:18 +0800 Subject: tools/power turbostat: Add initial support for ClearwaterForest Add initial support for ClearwaterForest. It shares the same features with SierraForest. 
Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 540336138ce9..e203f109dd2e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1037,6 +1037,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_ATOM_GRACEMONT, &adl_features }, { INTEL_ATOM_CRESTMONT_X, &srf_features }, { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_ATOM_DARKMONT_X, &srf_features }, { INTEL_XEON_PHI_KNL, &knl_features }, { INTEL_XEON_PHI_KNM, &knl_features }, /* -- cgit v1.2.3 From 9e47f8adb053b69e2e8310551e6fd5156704cef4 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 3 Dec 2024 12:23:22 -0500 Subject: tools/power turbostat: update turbostat(8) Clarify how to get the latest version. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index a7f7ed01421c..59b89e6b25bf 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -516,14 +516,40 @@ that they count at TSC rate, which is true on all processors tested to date. Volume 3B: System Programming Guide" https://www.intel.com/products/processor/manuals/ +.SH RUN THE LATEST VERSION +If turbostat complains that it doesn't recognize your processor, +please try the latest version. + +The latest version of turbostat does not require the latest version of the Linux kernel. +However, some features, such as perf(1) counters, do require kernel support. + +The latest turbostat release is available in the upstream Linux Kernel source tree. +eg. "git pull https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" +and run make in tools/power/x86/turbostat/. + +n.b. "make install" will update your system manually, but a distro update may subsequently downgrade your turbostat to an older version. +For this reason, manually installing to /usr/local/bin may be what you want. + +Note that turbostat/Makefile has a "make snapshot" target, which will create a tar file +that can build without a local kernel source tree. + +If the upstream version isn't new enough, the development tree can be found here: +"git pull https://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat" + +If the development tree doesn't work, please contact the author via chat, +or via email with the word "turbostat" on the Subject line. + .SH FILES .ta .nf +/sys/bus/event_source/devices/ /dev/cpu/*/msr +/sys/class/intel_pmt/ +/sys/devices/system/cpu/ .fi .SH "SEE ALSO" -msr(4), vmstat(8) +perf(1), msr(4), vmstat(8) .PP .SH AUTHOR .nf -- cgit v1.2.3 From 4133be39e216130a86382fb5cfbaf6851a6f7a45 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:16 +0800 Subject: tools/power turbostat: Exit on unsupported Intel models Turbostat requires per-platform enabling for Intel CPU models due to platform-specific features. When running on unsupported Intel CPU models, turbostat currently operates with limited default features, which can lead to users unknowingly using an outdated version of the tool. 
Enhance turbostat to exit by default when run on unsupported Intel CPU models, with a clear message to users, informing them that their CPU model is not supported and advising them to update to the latest version of turbostat for full functionality. [lenb: updated error message wording] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e203f109dd2e..5e894b71003c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1079,6 +1079,10 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } } + + fprintf(stderr, "Unsupported platform detected.\n" + "\tSee RUN THE LATEST VERSION on turbostat(8)\n"); + exit(1); } /* Model specific support End */ -- cgit v1.2.3 From 48c62ba1b407140229e92f5cfae6ae113fc4af8e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:17 +0800 Subject: tools/power turbostat: Exit on unsupported Vendors Turbostat currently supports x86 processors from Intel, AMD, and Hygon. The behavior of turbostat on CPUs from other vendors has not been evaluated and may lead to incorrect or undefined behavior. Enhance turbostat to exit by default when running on an unsupported CPU vendor. This ensures that users are aware that their CPU is not currently supported by turbostat, guiding them to seek support for their specific hardware through future patches. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5e894b71003c..cb659b274554 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,9 +1056,9 @@ void probe_platform_features(unsigned int family, unsigned int model) { int i; - platform = &default_features; if (authentic_amd || hygon_genuine) { + platform = &default_features; if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -1071,7 +1071,7 @@ void probe_platform_features(unsigned int family, unsigned int model) } if (!genuine_intel) - return; + goto end; for (i = 0; turbostat_pdata[i].features; i++) { if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { @@ -1080,6 +1080,10 @@ void probe_platform_features(unsigned int family, unsigned int model) } } +end: + if (platform) + return; + fprintf(stderr, "Unsupported platform detected.\n" "\tSee RUN THE LATEST VERSION on turbostat(8)\n"); exit(1); -- cgit v1.2.3 From cc63f89ef9db70f74c563317d36028bb5e6196a1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:18 +0800 Subject: tools/power turbostat: Improve --help output Improve the `--help` output of turbostat by standardizing the format and enhancing readability. The following changes are made to ensure consistency and clarity in the help message: 1. Use a consistent pattern for each parameter's help message: - Display the parameter and its input (if any) on the same line, separated by a space. - Provide the detailed description on a separate line. 2. Ensure that the first character of each description is in lower-case. These changes make the help output more uniform and easier to read, helping users quickly understand the available options and their usage. 
No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 41 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index cb659b274554..5165450a8187 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2145,41 +2145,52 @@ void help(void) "when COMMAND completes.\n" "If no COMMAND is specified, turbostat wakes every 5-seconds\n" "to print statistics, until interrupted.\n" - " -a, --add add a counter\n" + " -a, --add counter\n" + " add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" " eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" - " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" + " -c, --cpu cpu-set\n" + " limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" - " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " -d, --debug\n" + " displays usec, Time_Of_Day_Seconds and more debugging\n" " debug messages are printed to stderr\n" - " -D, --Dump displays the raw counter values\n" - " -e, --enable [all | column]\n" + " -D, --Dump\n" + " displays the raw counter values\n" + " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" - " -H, --hide [column|column,column,...]\n" + " -H, --hide [column | column,column,...]\n" " hide the specified column(s)\n" " -i, --interval sec.subsec\n" - " Override default 5-second measurement interval\n" - " -J, --Joules displays energy in Joules instead of Watts\n" - " -l, --list list column headers only\n" - " -M, --no-msr Disable all uses of the MSR driver\n" - " -P, --no-perf Disable all uses of the perf API\n" + " override default 5-second measurement interval\n" + " -J, --Joules\n" + " displays energy in Joules instead of Watts\n" + " -l, --list\n" + " list column headers only\n" + " -M, --no-msr\n" + " disable all uses of the MSR driver\n" + " -P, --no-perf\n" + " disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" " print header every num iterations\n" " -o, --out file\n" " create or truncate \"file\" for all output\n" - " -q, --quiet skip decoding system configuration header\n" - " -s, --show [column|column,column,...]\n" + " -q, --quiet\n" + " skip decoding system configuration header\n" + " -s, --show [column | column,column,...]\n" " show only the specified column(s)\n" " -S, --Summary\n" " limits output to 1-line system summary per interval\n" " -T, --TCC temperature\n" " sets the Thermal Control Circuit temperature in\n" " degrees Celsius\n" - " -h, --help print this help message\n" - " -v, --version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); + " -h, --help\n" + " print this help message\n" + " -v, --version\n" + " print version information\n" "\n" "For more help, run \"man turbostat\"\n"); } /* -- cgit v1.2.3 From 3d94026af328d3d355d15c1d7fe73278f77c6a42 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 3 Dec 2024 15:51:19 +0800 Subject: tools/power turbostat: Introduce --force parameter Turbostat currently exits under the following conditions: 1. When running on non-Intel/AMD/Hygon x86 vendors. 2. When running on Intel models that lack specific platform features. 
Introduce a new `--force` parameter that allows turbostat to run on these unsupported platforms with minimal default feature support. This provides users with the flexibility to gather basic information even on unsupported systems. [lenb: updated warning message text] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5165450a8187..7accc4a73366 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -326,6 +326,7 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; +unsigned int force_load; unsigned int has_aperf; unsigned int has_aperf_access; unsigned int has_epb; @@ -1058,7 +1059,8 @@ void probe_platform_features(unsigned int family, unsigned int model) if (authentic_amd || hygon_genuine) { - platform = &default_features; + /* fallback to default features on unsupported models */ + force_load++; if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -1067,7 +1069,7 @@ void probe_platform_features(unsigned int family, unsigned int model) if ((edx & (1 << 14)) && family >= 0x17) platform = &amd_features_with_rapl; } - return; + goto end; } if (!genuine_intel) @@ -1081,6 +1083,11 @@ void probe_platform_features(unsigned int family, unsigned int model) } end: + if (force_load && !platform) { + fprintf(outf, "Forced to run on unsupported platform!\n"); + platform = &default_features; + } + if (platform) return; @@ -2160,6 +2167,8 @@ void help(void) " displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" + " -f, --force\n" + " force load turbostat with minimum default features on unsupported platforms.\n" " -H, --hide [column | column,column,...]\n" " hide the specified column(s)\n" " -i, --interval sec.subsec\n" @@ -9942,6 +9951,7 @@ void cmdline(int argc, char **argv) { "Dump", no_argument, 0, 'D' }, { "debug", no_argument, 0, 'd' }, /* internal, not documented */ { "enable", required_argument, 0, 'e' }, + { "force", no_argument, 0, 'f' }, { "interval", required_argument, 0, 'i' }, { "IPC", no_argument, 0, 'I' }, { "num_iterations", required_argument, 0, 'n' }, @@ -10002,6 +10012,9 @@ void cmdline(int argc, char **argv) /* --enable specified counter */ bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST); break; + case 'f': + force_load++; + break; case 'd': debug++; ENABLE_BIC(BIC_DISABLED_BY_DEFAULT); -- cgit v1.2.3 From cbd8730aea8d79cda6b0f3c18b406dfdef0c1b80 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:58 -0800 Subject: bpf: Improve verifier log for resource leak on exit The verifier log when leaking resources on BPF_EXIT may be a bit confusing, as it's a problem only when finally existing from the main prog, not from any of the subprogs. Hence, update the verifier error string and the corresponding selftests matching on it. 
Acked-by: Eduard Zingerman Suggested-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/exceptions_fail.c | 4 ++-- tools/testing/selftests/bpf/progs/preempt_lock.c | 14 +++++++------- tools/testing/selftests/bpf/progs/verifier_spin_lock.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index fe0f3fa5aab6..8a0fdff89927 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -131,7 +131,7 @@ int reject_subprog_with_lock(void *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_rcu_read_lock-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_rcu_read_lock-ed region") int reject_with_rcu_read_lock(void *ctx) { bpf_rcu_read_lock(); @@ -147,7 +147,7 @@ __noinline static int throwing_subprog(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_rcu_read_lock-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_rcu_read_lock-ed region") int reject_subprog_with_rcu_read_lock(void *ctx) { bpf_rcu_read_lock(); diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 885377e83607..5269571cf7b5 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -6,7 +6,7 @@ #include "bpf_experimental.h" SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_1(struct __sk_buff *ctx) { bpf_preempt_disable(); @@ -14,7 +14,7 @@ int preempt_lock_missing_1(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_2(struct __sk_buff *ctx) { bpf_preempt_disable(); @@ -23,7 +23,7 @@ int preempt_lock_missing_2(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_3(struct __sk_buff *ctx) { bpf_preempt_disable(); @@ -33,7 +33,7 @@ int preempt_lock_missing_3(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) { bpf_preempt_disable(); @@ -55,7 +55,7 @@ static __noinline void preempt_enable(void) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) { preempt_disable(); @@ -63,7 +63,7 @@ int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) } SEC("?tc") 
-__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) { preempt_disable(); @@ -72,7 +72,7 @@ int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_preempt_disable-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) { preempt_disable(); diff --git a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c index 3f679de73229..25599eac9a70 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c @@ -187,7 +187,7 @@ l0_%=: r6 = r0; \ SEC("cgroup/skb") __description("spin_lock: test6 missing unlock") -__failure __msg("BPF_EXIT instruction cannot be used inside bpf_spin_lock-ed region") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_spin_lock-ed region") __failure_unpriv __msg_unpriv("") __naked void spin_lock_test6_missing_unlock(void) { -- cgit v1.2.3 From e8c6c80b76e53632992ec345a02c05942aa8f3f2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:59 -0800 Subject: selftests/bpf: Expand coverage of preempt tests to sleepable kfunc For preemption-related kfuncs, we don't test their interaction with sleepable kfuncs (we do test helpers) even though the verifier has code to protect against such a pattern. Expand coverage of the selftest to include this case. Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/preempt_lock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 5269571cf7b5..6c5797bf0ead 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -5,6 +5,8 @@ #include "bpf_misc.h" #include "bpf_experimental.h" +extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; + SEC("?tc") __failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_preempt_disable-ed region") int preempt_lock_missing_1(struct __sk_buff *ctx) @@ -113,6 +115,18 @@ int preempt_sleepable_helper(void *ctx) return 0; } +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_copy_from_user_str is sleepable within non-preemptible region") +int preempt_sleepable_kfunc(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + bpf_copy_from_user_str(&data, sizeof(data), NULL, 0); + bpf_preempt_enable(); + return 0; +} + int __noinline preempt_global_subprog(void) { preempt_balance_subprog(); -- cgit v1.2.3 From 4fec4c22f046a64741a1ae417de718504fd2cda2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:04:00 -0800 Subject: selftests/bpf: Add IRQ save/restore tests Include tests that check for rejection in erroneous cases, like unbalanced IRQ-disabled counts, within and across subprogs, invalid IRQ flag state or input to kfuncs, behavior upon overwriting 
IRQ saved state on stack, interaction with sleepable kfuncs/helpers, global functions, and out of order restore. Include some success scenarios as well to demonstrate usage. #128/1 irq/irq_save_bad_arg:OK #128/2 irq/irq_restore_bad_arg:OK #128/3 irq/irq_restore_missing_2:OK #128/4 irq/irq_restore_missing_3:OK #128/5 irq/irq_restore_missing_3_minus_2:OK #128/6 irq/irq_restore_missing_1_subprog:OK #128/7 irq/irq_restore_missing_2_subprog:OK #128/8 irq/irq_restore_missing_3_subprog:OK #128/9 irq/irq_restore_missing_3_minus_2_subprog:OK #128/10 irq/irq_balance:OK #128/11 irq/irq_balance_n:OK #128/12 irq/irq_balance_subprog:OK #128/13 irq/irq_global_subprog:OK #128/14 irq/irq_restore_ooo:OK #128/15 irq/irq_restore_ooo_3:OK #128/16 irq/irq_restore_3_subprog:OK #128/17 irq/irq_restore_4_subprog:OK #128/18 irq/irq_restore_ooo_3_subprog:OK #128/19 irq/irq_restore_invalid:OK #128/20 irq/irq_save_invalid:OK #128/21 irq/irq_restore_iter:OK #128/22 irq/irq_save_iter:OK #128/23 irq/irq_flag_overwrite:OK #128/24 irq/irq_flag_overwrite_partial:OK #128/25 irq/irq_ooo_refs_array:OK #128/26 irq/irq_sleepable_helper:OK #128/27 irq/irq_sleepable_kfunc:OK #128 irq:OK Summary: 1/27 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/irq.c | 444 ++++++++++++++++++++++ 2 files changed, 446 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/irq.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index d9f65adb456b..b1b4d69c407a 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -98,6 +98,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "irq.skel.h" #define MAX_ENTRIES 11 @@ -225,6 +226,7 @@ void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } +void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c new file mode 100644 index 000000000000..b0b53d980964 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/irq.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" + +unsigned long global_flags; + +extern void bpf_local_irq_save(unsigned long *) __weak __ksym; +extern void bpf_local_irq_restore(unsigned long *) __weak __ksym; +extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; + +SEC("?tc") +__failure __msg("arg#0 doesn't point to an irq flag on stack") +int irq_save_bad_arg(struct __sk_buff *ctx) +{ + bpf_local_irq_save(&global_flags); + return 0; +} + +SEC("?tc") +__failure __msg("arg#0 doesn't point to an irq flag on stack") +int irq_restore_bad_arg(struct __sk_buff *ctx) +{ + bpf_local_irq_restore(&global_flags); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_2(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags2); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_3(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags2); + bpf_local_irq_save(&flags3); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_3_minus_2(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags2); + bpf_local_irq_save(&flags3); + bpf_local_irq_restore(&flags3); + bpf_local_irq_restore(&flags2); + return 0; +} + +static __noinline void local_irq_save(unsigned long *flags) +{ + bpf_local_irq_save(flags); +} + +static __noinline void local_irq_restore(unsigned long *flags) +{ + bpf_local_irq_restore(flags); +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_1_subprog(struct __sk_buff *ctx) +{ + unsigned long flags; + + local_irq_save(&flags); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_2_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + + local_irq_save(&flags1); + local_irq_save(&flags2); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_3_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save(&flags1); + local_irq_save(&flags2); + local_irq_save(&flags3); + return 0; +} + +SEC("?tc") +__failure __msg("BPF_EXIT instruction in main prog cannot be used inside bpf_local_irq_save-ed region") +int irq_restore_missing_3_minus_2_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save(&flags1); + local_irq_save(&flags2); + local_irq_save(&flags3); + local_irq_restore(&flags3); + local_irq_restore(&flags2); + return 0; +} + +SEC("?tc") +__success +int irq_balance(struct __sk_buff *ctx) +{ + unsigned long flags; + + local_irq_save(&flags); + local_irq_restore(&flags); + return 0; +} + +SEC("?tc") +__success +int irq_balance_n(struct __sk_buff *ctx) +{ + 
unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save(&flags1); + local_irq_save(&flags2); + local_irq_save(&flags3); + local_irq_restore(&flags3); + local_irq_restore(&flags2); + local_irq_restore(&flags1); + return 0; +} + +static __noinline void local_irq_balance(void) +{ + unsigned long flags; + + local_irq_save(&flags); + local_irq_restore(&flags); +} + +static __noinline void local_irq_balance_n(void) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save(&flags1); + local_irq_save(&flags2); + local_irq_save(&flags3); + local_irq_restore(&flags3); + local_irq_restore(&flags2); + local_irq_restore(&flags1); +} + +SEC("?tc") +__success +int irq_balance_subprog(struct __sk_buff *ctx) +{ + local_irq_balance(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int irq_sleepable_helper(void *ctx) +{ + unsigned long flags; + u32 data; + + local_irq_save(&flags); + bpf_copy_from_user(&data, sizeof(data), NULL); + local_irq_restore(&flags); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_copy_from_user_str is sleepable within IRQ-disabled region") +int irq_sleepable_kfunc(void *ctx) +{ + unsigned long flags; + u32 data; + + local_irq_save(&flags); + bpf_copy_from_user_str(&data, sizeof(data), NULL, 0); + local_irq_restore(&flags); + return 0; +} + +int __noinline global_local_irq_balance(void) +{ + local_irq_balance_n(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with IRQs disabled") +int irq_global_subprog(struct __sk_buff *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_local_irq_balance(); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?tc") +__failure __msg("cannot restore irq state out of order") +int irq_restore_ooo(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags2); + bpf_local_irq_restore(&flags1); + bpf_local_irq_restore(&flags2); + return 0; +} + +SEC("?tc") +__failure __msg("cannot restore irq state out of order") +int irq_restore_ooo_3(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags2); + bpf_local_irq_restore(&flags2); + bpf_local_irq_save(&flags3); + bpf_local_irq_restore(&flags1); + bpf_local_irq_restore(&flags3); + return 0; +} + +static __noinline void local_irq_save_3(unsigned long *flags1, unsigned long *flags2, + unsigned long *flags3) +{ + local_irq_save(flags1); + local_irq_save(flags2); + local_irq_save(flags3); +} + +SEC("?tc") +__success +int irq_restore_3_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save_3(&flags1, &flags2, &flags3); + bpf_local_irq_restore(&flags3); + bpf_local_irq_restore(&flags2); + bpf_local_irq_restore(&flags1); + return 0; +} + +SEC("?tc") +__failure __msg("cannot restore irq state out of order") +int irq_restore_4_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + unsigned long flags4; + + local_irq_save_3(&flags1, &flags2, &flags3); + bpf_local_irq_restore(&flags3); + bpf_local_irq_save(&flags4); + bpf_local_irq_restore(&flags4); + bpf_local_irq_restore(&flags1); + return 0; +} + +SEC("?tc") +__failure __msg("cannot restore irq state out of order") +int 
irq_restore_ooo_3_subprog(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags2; + unsigned long flags3; + + local_irq_save_3(&flags1, &flags2, &flags3); + bpf_local_irq_restore(&flags3); + bpf_local_irq_restore(&flags2); + bpf_local_irq_save(&flags3); + bpf_local_irq_restore(&flags1); + return 0; +} + +SEC("?tc") +__failure __msg("expected an initialized") +int irq_restore_invalid(struct __sk_buff *ctx) +{ + unsigned long flags1; + unsigned long flags = 0xfaceb00c; + + bpf_local_irq_save(&flags1); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?tc") +__failure __msg("expected uninitialized") +int irq_save_invalid(struct __sk_buff *ctx) +{ + unsigned long flags1; + + bpf_local_irq_save(&flags1); + bpf_local_irq_save(&flags1); + return 0; +} + +SEC("?tc") +__failure __msg("expected an initialized") +int irq_restore_iter(struct __sk_buff *ctx) +{ + struct bpf_iter_num it; + + bpf_iter_num_new(&it, 0, 42); + bpf_local_irq_restore((unsigned long *)&it); + return 0; +} + +SEC("?tc") +__failure __msg("Unreleased reference id=1") +int irq_save_iter(struct __sk_buff *ctx) +{ + struct bpf_iter_num it; + + /* Ensure same sized slot has st->ref_obj_id set, so we reject based on + * slot_type != STACK_IRQ_FLAG... + */ + _Static_assert(sizeof(it) == sizeof(unsigned long), "broken iterator size"); + + bpf_iter_num_new(&it, 0, 42); + bpf_local_irq_save((unsigned long *)&it); + bpf_local_irq_restore((unsigned long *)&it); + return 0; +} + +SEC("?tc") +__failure __msg("expected an initialized") +int irq_flag_overwrite(struct __sk_buff *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + flags = 0xdeadbeef; + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?tc") +__failure __msg("expected an initialized") +int irq_flag_overwrite_partial(struct __sk_buff *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + *(((char *)&flags) + 1) = 0xff; + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?tc") +__failure __msg("cannot restore irq state out of order") +int irq_ooo_refs_array(struct __sk_buff *ctx) +{ + unsigned long flags[4]; + struct { int i; } *p; + + /* refs=1 */ + bpf_local_irq_save(&flags[0]); + + /* refs=1,2 */ + p = bpf_obj_new(typeof(*p)); + if (!p) { + bpf_local_irq_restore(&flags[0]); + return 0; + } + + /* refs=1,2,3 */ + bpf_local_irq_save(&flags[1]); + + /* refs=1,2,3,4 */ + bpf_local_irq_save(&flags[2]); + + /* Now when we remove ref=2, the verifier must not break the ordering in + * the refs array between 1,3,4. With an older implementation, the + * verifier would swap the last element with the removed element, but to + * maintain the stack property we need to use memmove. + */ + bpf_obj_drop(p); + + /* Save and restore to reset active_irq_id to 3, as the ordering is now + * refs=1,4,3. When restoring the linear scan will find prev_id in order + * as 3 instead of 4. + */ + bpf_local_irq_save(&flags[3]); + bpf_local_irq_restore(&flags[3]); + + /* With the incorrect implementation, we can release flags[1], flags[2], + * and flags[0], i.e. in the wrong order. 
+ */ + bpf_local_irq_restore(&flags[1]); + bpf_local_irq_restore(&flags[2]); + bpf_local_irq_restore(&flags[0]); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From e2f0791124a1b6ca8d570110cbd487969d9d41ef Mon Sep 17 00:00:00 2001 From: Marco Leogrande Date: Mon, 2 Dec 2024 12:45:30 -0800 Subject: tools/testing/selftests/bpf/test_tc_tunnel.sh: Fix wait for server bind Commit f803bcf9208a ("selftests/bpf: Prevent client connect before server bind in test_tc_tunnel.sh") added code that waits for the netcat server to start before the netcat client attempts to connect to it. However, not all calls to 'server_listen' were guarded. This patch adds the existing 'wait_for_port' guard after the remaining call to 'server_listen'. Fixes: f803bcf9208a ("selftests/bpf: Prevent client connect before server bind in test_tc_tunnel.sh") Signed-off-by: Marco Leogrande Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20241202204530.1143448-1-leogrande@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7989ec608454..cb55a908bb0d 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -305,6 +305,7 @@ else client_connect verify_data server_listen + wait_for_port ${port} ${netcat_opt} fi # serverside, use BPF for decap -- cgit v1.2.3 From ef7009decc30eb2515a64253791d61b72229c119 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 21 Nov 2024 21:40:17 +0000 Subject: selftests/sched_ext: fix build after renames in sched_ext API The selftests are falining to build on current tip of bpf-next and sched_ext [1]. This has broken BPF CI [2] after merge from upstream. Use appropriate function names in the selftests according to the recent changes in the sched_ext API [3]. 
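As a rough before/after sketch of what the rename means for a scheduler's callbacks (the callback names below are invented for illustration, and DSQ_ID stands in for whatever custom DSQ the scheduler created; only the kfunc names change, the argument lists stay the same):

  /* old kfunc names */
  void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
  {
          scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
  }

  void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
  {
          scx_bpf_consume(DSQ_ID);
  }

  /* renamed kfuncs used by the updated selftests */
  void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
  {
          scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
  }

  void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
  {
          scx_bpf_dsq_move_to_local(DSQ_ID);
  }

scx_bpf_dispatch_vtime() is likewise replaced by scx_bpf_dsq_insert_vtime() with an unchanged argument list.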
[1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 [2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 [3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ Signed-off-by: Ihor Solodrai Acked-by: Andrea Righi Acked-by: David Vernet Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- .../testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- .../testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- 12 files changed, 19 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c index 37d9bf6fb745..6f4c3f5a1c5d 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, * If we dispatch to a bogus DSQ that will fall back to the * builtin global DSQ, we fail gracefully. */ - scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c index dffc97d9cdf1..e4a55027778f 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, if (cpu >= 0) { /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ - scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, - p->scx.dsq_vtime, 0); + scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6a7db1502c29..6325bf76f47e 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) target = bpf_get_prandom_u32() % nr_cpus; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); } diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c index 1efb50d61040..a7cf868d5e31 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, /* Can only call from ops.select_cpu() */ scx_bpf_select_cpu_dfl(p, 0, 0, &found); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c index d75d4faf07f6..4bc36182d3ff 100644 --- a/tools/testing/selftests/sched_ext/exit.bpf.c +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) if (exit_point == EXIT_ENQUEUE) EXIT_CLEANLY(); - scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) @@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) if (exit_point == EXIT_DISPATCH) EXIT_CLEANLY(); - scx_bpf_consume(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4d4cd8d966db..4c005fa71810 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c index f171ac470970..13d0f5be788d 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct 
task_struct *p, } scx_bpf_put_idle_cpumask(idle_mask); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c index 9efdbb7da928..815f1d5d61ac 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, saw_local = true; } - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); } s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c index 59bfc4f36167..4bb99699e920 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, cpu = prev_cpu; dispatch: - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c index 3bbd5fcdfb18..2a75de11b2cf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching to a random DSQ should fail. */ - scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c index 0fda57fe0ecf..99d075695c97 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching twice in a row is disallowed. */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c index e6c67bcf5e6e..bfcb96cd4954 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -2,8 +2,8 @@ /* * A scheduler that validates that enqueue flags are properly stored and * applied at dispatch time when a task is directly dispatched from - * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and - * making the test a very basic vtime scheduler. + * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), + * and making the test a very basic vtime scheduler. * * Copyright (c) 2024 Meta Platforms, Inc. 
and affiliates. * Copyright (c) 2024 David Vernet @@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, cpu = prev_cpu; scx_bpf_test_and_clear_cpu_idle(cpu); ddsp: - scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); return cpu; } void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) { - if (scx_bpf_consume(VTIME_DSQ)) + if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) consumed = true; } -- cgit v1.2.3 From f24d192985cbd6782850fdbb3839039da2f0ee76 Mon Sep 17 00:00:00 2001 From: guanjing Date: Sun, 17 Nov 2024 10:51:29 +0800 Subject: sched_ext: fix application of sizeof to pointer sizeof when applied to a pointer typed expression gives the size of the pointer. The proper fix in this particular case is to code sizeof(*cpuset) instead of sizeof(cpuset). This issue was detected with the help of Coccinelle. Fixes: 22a920209ab6 ("sched_ext: Implement tickless support") Signed-off-by: guanjing Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 21deea320bd7..e938156ed0a0 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -97,7 +97,7 @@ restart: SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); CPU_ZERO(cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); - SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); CPU_FREE(cpuset); -- cgit v1.2.3 From ac98b3132402e1b892c16f87d766f21ef18dd344 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Wed, 4 Dec 2024 07:28:44 +0900 Subject: selftests/net: call sendmmsg via udpgso_bench.sh Currently, sendmmsg is implemented in udpgso_bench_tx.c, but it is not called by any test script. This patch adds a test for sendmmsg in udpgso_bench.sh. This allows for basic API testing and benchmarking comparisons with GSO. Signed-off-by: Kenjiro Nakayama Reviewed-by: Hangbin Liu Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241203222843.26983-1-nakayamakenjiro@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso_bench.sh | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index 640bc43452fa..88fa1d53ba2b 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -92,6 +92,9 @@ run_udp() { echo "udp" run_in_netns ${args} + echo "udp sendmmsg" + run_in_netns ${args} -m + echo "udp gso" run_in_netns ${args} -S 0 -- cgit v1.2.3 From 523d3cc4b6d1ae18bfa516345d48332d455181e6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:42 -0800 Subject: ynl: support enum-cnt-name attribute in legacy definitions This is similar to existing attr-cnt-name in the attributes to allow changing the name of the 'count' enum entry. 
Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d8201c4b1520..bfe95826ae3e 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -801,6 +801,7 @@ class EnumSet(SpecEnumSet): self.user_type = 'int' self.value_pfx = yaml.get('name-prefix', f"{family.ident_name}-{yaml['name']}-") + self.enum_cnt_name = yaml.get('enum-cnt-name', None) super().__init__(family, yaml) @@ -2472,9 +2473,12 @@ def render_uapi(family, cw): max_val = f' = {enum.get_mask()},' cw.p(max_name + max_val) else: + cnt_name = enum.enum_cnt_name max_name = c_upper(name_pfx + 'max') - cw.p('__' + max_name + ',') - cw.p(max_name + ' = (__' + max_name + ' - 1)') + if not cnt_name: + cnt_name = '__' + name_pfx + 'max' + cw.p(c_upper(cnt_name) + ',') + cw.p(max_name + ' = (' + c_upper(cnt_name) + ' - 1)') cw.block_end(line=';') cw.nl() elif const['type'] == 'const': -- cgit v1.2.3 From 8c843ecde4e49e11063ad942675246ec685ea19a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:43 -0800 Subject: ynl: skip rendering attributes with header property in uapi mode To allow omitting some of the attributes in the final generated file. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index bfe95826ae3e..79829ce39139 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -801,6 +801,7 @@ class EnumSet(SpecEnumSet): self.user_type = 'int' self.value_pfx = yaml.get('name-prefix', f"{family.ident_name}-{yaml['name']}-") + self.header = yaml.get('header', None) self.enum_cnt_name = yaml.get('enum-cnt-name', None) super().__init__(family, yaml) @@ -2441,6 +2442,9 @@ def render_uapi(family, cw): if const['type'] == 'enum' or const['type'] == 'flags': enum = family.consts[const['name']] + if enum.header: + continue + if enum.has_doc(): if enum.has_entry_doc(): cw.p('/**') -- cgit v1.2.3 From 56881d07f0b4cb97f3c460dc3908eee91fc51a17 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:44 -0800 Subject: ynl: support directional specs in ynl-gen-c.py The intent is to generate ethtool uapi headers. 
For now, some of the things are hard-coded: - _MSG_{USER,KERNEL}_MAX - the split between USER and KERNEL messages Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 118 +++++++++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 79829ce39139..2bf4d992e54a 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2419,6 +2419,87 @@ def uapi_enum_start(family, cw, obj, ckey='', enum_name='enum-name'): cw.block_start(line=start_line) +def render_uapi_unified(family, cw, max_by_define, separate_ntf): + max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) + cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) + max_value = f"({cnt_name} - 1)" + + uapi_enum_start(family, cw, family['operations'], 'enum-name') + val = 0 + for op in family.msgs.values(): + if separate_ntf and ('notify' in op or 'event' in op): + continue + + suffix = ',' + if op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + +def render_uapi_directional(family, cw, max_by_define): + max_name = f"{family.op_prefix}USER_MAX" + cnt_name = f"__{family.op_prefix}USER_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_USER_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if 'do' in op and 'event' not in op: + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + max_name = f"{family.op_prefix}KERNEL_MAX" + cnt_name = f"__{family.op_prefix}KERNEL_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_KERNEL_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if ('do' in op and 'reply' in op['do']) or 'notify' in op or 'event' in op: + enum_name = op.enum_name + if 'event' not in op and 'notify' not in op: + enum_name = f'{enum_name}_REPLY' + + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + def render_uapi(family, cw): hdr_prot = f"_UAPI_LINUX_{c_upper(family.uapi_header_name)}_H" hdr_prot = hdr_prot.replace('/', '_') @@ -2523,30 +2604,12 @@ def render_uapi(family, cw): # Commands separate_ntf = 'async-prefix' in family['operations'] - max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) - cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) - max_value = f"({cnt_name} - 1)" - - uapi_enum_start(family, cw, family['operations'], 'enum-name') - val = 0 - for op in family.msgs.values(): - if separate_ntf and 
('notify' in op or 'event' in op): - continue - - suffix = ',' - if op.value != val: - suffix = f" = {op.value}," - val = op.value - cw.p(op.enum_name + suffix) - val += 1 - cw.nl() - cw.p(cnt_name + ('' if max_by_define else ',')) - if not max_by_define: - cw.p(f"{max_name} = {max_value}") - cw.block_end(line=';') - if max_by_define: - cw.p(f"#define {max_name} {max_value}") - cw.nl() + if family.msg_id_model == 'unified': + render_uapi_unified(family, cw, max_by_define, separate_ntf) + elif family.msg_id_model == 'directional': + render_uapi_directional(family, cw, max_by_define) + else: + raise Exception(f'Unsupported message enum-model {family.msg_id_model}') if separate_ntf: uapi_enum_start(family, cw, family['operations'], enum_name='async-enum') @@ -2670,13 +2733,6 @@ def main(): os.sys.exit(1) return - supported_models = ['unified'] - if args.mode in ['user', 'kernel']: - supported_models += ['directional'] - if parsed.msg_id_model not in supported_models: - print(f'Message enum-model {parsed.msg_id_model} not supported for {args.mode} generation') - os.sys.exit(1) - cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) _, spec_kernel = find_kernel_root(args.spec) -- cgit v1.2.3 From 001b0b59efbbdf54126c2ae512009d4a7c9f9f88 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:46 -0800 Subject: ynl: include uapi header after all dependencies Essentially reverse the order of headers for userspace generated files. Before (make -C tools/net/ynl/; cat tools/net/ynl/ethtool-user.h): #include #include #include #include After: #include #include While at it, make sure we track which headers we've already included and include the headers only once. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 2bf4d992e54a..8098bcbb6f40 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2782,12 +2782,17 @@ def main(): else: cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') - headers = [parsed.uapi_header] + headers = [] for definition in parsed['definitions']: if 'header' in definition: headers.append(definition['header']) + if args.mode == 'user': + headers.append(parsed.uapi_header) + seen_header = [] for one in headers: - cw.p(f"#include <{one}>") + if one not in seen_header: + cw.p(f"#include <{one}>") + seen_header.append(one) cw.nl() if args.mode == "user": -- cgit v1.2.3 From e10500b69c3f3378f3dcfc8c2fe4cdb74fc844f5 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 5 Dec 2024 13:59:42 +0000 Subject: libbpf: Fix segfault due to libelf functions not setting errno Libelf functions do not set errno on failure. Instead, it relies on its internal _elf_errno value, that can be retrieved via elf_errno (or the corresponding message via elf_errmsg()). From "man libelf": If a libelf function encounters an error it will set an internal error code that can be retrieved with elf_errno. Each thread maintains its own separate error code. The meaning of each error code can be determined with elf_errmsg, which returns a string describing the error. As a consequence, libbpf should not return -errno when a function from libelf fails, because an empty value will not be interpreted as an error and won't prevent the program to stop. 
This is visible in bpf_linker__add_file(), for example, where we call a succession of functions that rely on libelf: err = err ?: linker_load_obj_file(linker, filename, opts, &obj); err = err ?: linker_append_sec_data(linker, &obj); err = err ?: linker_append_elf_syms(linker, &obj); err = err ?: linker_append_elf_relos(linker, &obj); err = err ?: linker_append_btf(linker, &obj); err = err ?: linker_append_btf_ext(linker, &obj); If the object file that we try to process is not, in fact, a correct object file, linker_load_obj_file() may fail with errno not being set, and return 0. In this case we attempt to run linker_append_elf_sysms() and may segfault. This can happen (and was discovered) with bpftool: $ bpftool gen object output.o sample_ret0.bpf.c libbpf: failed to get ELF header for sample_ret0.bpf.c: invalid `Elf' handle zsh: segmentation fault (core dumped) bpftool gen object output.o sample_ret0.bpf.c Fix the issue by returning a non-null error code (-EINVAL) when libelf functions fail. Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs") Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241205135942.65262-1-qmo@kernel.org --- tools/lib/bpf/linker.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index cf71d149fe26..e56ba6e67451 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -566,17 +566,15 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, } obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); if (!obj->elf) { - err = -errno; pr_warn_elf("failed to parse ELF file '%s'", filename); - return err; + return -EINVAL; } /* Sanity check ELF file high-level properties */ ehdr = elf64_getehdr(obj->elf); if (!ehdr) { - err = -errno; pr_warn_elf("failed to get ELF header for %s", filename); - return err; + return -EINVAL; } /* Linker output endianness set by first input object */ @@ -606,9 +604,8 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, } if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { - err = -errno; pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); - return err; + return -EINVAL; } scn = NULL; @@ -618,26 +615,23 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, shdr = elf64_getshdr(scn); if (!shdr) { - err = -errno; pr_warn_elf("failed to get section #%zu header for %s", sec_idx, filename); - return err; + return -EINVAL; } sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); if (!sec_name) { - err = -errno; pr_warn_elf("failed to get section #%zu name for %s", sec_idx, filename); - return err; + return -EINVAL; } data = elf_getdata(scn, 0); if (!data) { - err = -errno; pr_warn_elf("failed to get section #%zu (%s) data from %s", sec_idx, sec_name, filename); - return err; + return -EINVAL; } sec = add_src_sec(obj, sec_name); @@ -2680,14 +2674,14 @@ int bpf_linker__finalize(struct bpf_linker *linker) /* Finalize ELF layout */ if (elf_update(linker->elf, ELF_C_NULL) < 0) { - err = -errno; + err = -EINVAL; pr_warn_elf("failed to finalize ELF layout"); return libbpf_err(err); } /* Write out final ELF contents */ if (elf_update(linker->elf, ELF_C_WRITE) < 0) { - err = -errno; + err = -EINVAL; pr_warn_elf("failed to write ELF contents"); return libbpf_err(err); } -- cgit v1.2.3 From 0bee36d1a51366fa57b731f8975f26f92943b43e Mon Sep 17 00:00:00 2001 From: Song 
Yoong Siang Date: Thu, 5 Dec 2024 12:42:58 +0800 Subject: selftests/bpf: Actuate tx_metadata_len in xdp_hw_metadata set XDP_UMEM_TX_METADATA_LEN flag to reserve tx_metadata_len bytes of per-chunk metadata. Fixes: d5e726d9143c ("xsk: Require XDP_UMEM_TX_METADATA_LEN to actuate tx_metadata_len") Signed-off-by: Song Yoong Siang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241205044258.3155799-1-yoong.siang.song@intel.com --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 6f9956eed797..ad6c08dfd6c8 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -79,7 +79,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XSK_UMEM__DEFAULT_FLAGS, + .flags = XDP_UMEM_TX_METADATA_LEN, .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx = 0; -- cgit v1.2.3 From 2309132fc5d9d87deb15bda3497326aded6bfe4a Mon Sep 17 00:00:00 2001 From: Song Yoong Siang Date: Thu, 5 Dec 2024 13:19:36 +0800 Subject: selftests/bpf: Enable Tx hwtstamp in xdp_hw_metadata Currently, user needs to manually enable transmit hardware timestamp feature of certain Ethernet drivers, e.g. stmmac and igc drivers, through following command after running the xdp_hw_metadata app. sudo hwstamp_ctl -i eth0 -t 1 To simplify the step test of xdp_hw_metadata, set tx_type to HWTSTAMP_TX_ON to enable hardware timestamping for all outgoing packets, so that user no longer need to execute hwstamp_ctl command. Signed-off-by: Song Yoong Siang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241205051936.3156307-1-yoong.siang.song@intel.com --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index ad6c08dfd6c8..e38675d9b118 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -551,6 +551,7 @@ static void hwtstamp_enable(const char *ifname) { struct hwtstamp_config cfg = { .rx_filter = HWTSTAMP_FILTER_ALL, + .tx_type = HWTSTAMP_TX_ON, }; hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg); -- cgit v1.2.3 From 1e7e1f0e8be147ae98fe88ec82150c97265965a6 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 3 Dec 2024 19:05:20 -0800 Subject: selftests/tc-testing: sfq: test that kernel rejects limit of 1 Add test to check that the kernel rejects a configuration with the limit set to 1. 
Signed-off-by: Octavian Purdila Link: https://patch.msgid.link/20241204030520.2084663-3-tavip@google.com Signed-off-by: Jakub Kicinski --- .../tc-testing/scripts/sfq_rejects_limit_1.py | 21 +++++++++++++++++++++ .../selftests/tc-testing/tc-tests/qdiscs/sfq.json | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100755 tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py new file mode 100755 index 000000000000..0f44a6199495 --- /dev/null +++ b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Script that checks that SFQ rejects a limit of 1 at the kernel +# level. We can't use iproute2's tc because it does not accept a limit +# of 1. + +import sys +import os + +from pyroute2 import IPRoute +from pyroute2.netlink.exceptions import NetlinkError + +ip = IPRoute() +ifidx = ip.link_lookup(ifname=sys.argv[1]) + +try: + ip.tc('add', 'sfq', ifidx, limit=1) + sys.exit(1) +except NetlinkError: + sys.exit(0) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json index 16d51936b385..50e8d72781cb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json @@ -208,5 +208,25 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "4d6f", + "name": "Check that limit of 1 is rejected", + "category": [ + "qdisc", + "sfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + ], + "cmdUnderTest": "./scripts/sfq_rejects_limit_1.py $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "sfq", + "matchCount": "0", + "teardown": [ + ] } ] -- cgit v1.2.3 From d6212d82bf26f3cbd30b84df064080dd98051ae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 4 Dec 2024 14:28:26 +0100 Subject: selftests/bpf: Consolidate kernel modules into common directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The selftests build four kernel modules which use copy-pasted Makefile targets. This is a bit messy, and doesn't scale so well when we add more modules, so let's consolidate these rules into a single rule generated for each module name, and move the module sources into a single directory. To avoid parallel builds of the different modules stepping on each other's toes during the 'modpost' phase of the Kbuild 'make modules', the module files should really be a grouped target. However, make only added explicit support for grouped targets in version 4.3, which is newer than the minimum version supported by the kernel. However, make implicitly treats pattern matching rules with multiple targets as a grouped target, so we can work around this by turning the rule into a pattern matching target. We do this by replacing '.ko' with '%ko' in the targets with subst(). 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Andrii Nakryiko Acked-by: Viktor Malik Link: https://lore.kernel.org/bpf/20241204-bpf-selftests-mod-compile-v5-1-b96231134a49@redhat.com --- tools/testing/selftests/bpf/Makefile | 64 +- .../selftests/bpf/bpf_test_modorder_x/Makefile | 19 - .../bpf/bpf_test_modorder_x/bpf_test_modorder_x.c | 39 - .../selftests/bpf/bpf_test_modorder_y/Makefile | 19 - .../bpf/bpf_test_modorder_y/bpf_test_modorder_y.c | 39 - .../testing/selftests/bpf/bpf_test_no_cfi/Makefile | 19 - .../bpf/bpf_test_no_cfi/bpf_test_no_cfi.c | 84 -- tools/testing/selftests/bpf/bpf_testmod/.gitignore | 6 - tools/testing/selftests/bpf/bpf_testmod/Makefile | 20 - .../selftests/bpf/bpf_testmod/bpf_testmod-events.h | 71 - .../selftests/bpf/bpf_testmod/bpf_testmod.c | 1487 -------------------- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 113 -- .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 162 --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 2 +- tools/testing/selftests/bpf/progs/bad_struct_ops.c | 2 +- tools/testing/selftests/bpf/progs/cb_refs.c | 2 +- tools/testing/selftests/bpf/progs/epilogue_exit.c | 4 +- .../selftests/bpf/progs/epilogue_tailcall.c | 4 +- tools/testing/selftests/bpf/progs/iters_testmod.c | 2 +- tools/testing/selftests/bpf/progs/jit_probe_mem.c | 2 +- .../selftests/bpf/progs/kfunc_call_destructive.c | 2 +- .../testing/selftests/bpf/progs/kfunc_call_fail.c | 2 +- .../testing/selftests/bpf/progs/kfunc_call_race.c | 2 +- .../testing/selftests/bpf/progs/kfunc_call_test.c | 2 +- .../selftests/bpf/progs/kfunc_call_test_subprog.c | 2 +- .../testing/selftests/bpf/progs/local_kptr_stash.c | 2 +- tools/testing/selftests/bpf/progs/map_kptr.c | 2 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 2 +- tools/testing/selftests/bpf/progs/missed_kprobe.c | 2 +- .../selftests/bpf/progs/missed_kprobe_recursion.c | 2 +- tools/testing/selftests/bpf/progs/nested_acquire.c | 2 +- tools/testing/selftests/bpf/progs/pro_epilogue.c | 4 +- .../selftests/bpf/progs/pro_epilogue_goto_start.c | 4 +- tools/testing/selftests/bpf/progs/sock_addr_kern.c | 2 +- .../selftests/bpf/progs/struct_ops_detach.c | 2 +- .../selftests/bpf/progs/struct_ops_forgotten_cb.c | 2 +- .../selftests/bpf/progs/struct_ops_maybe_null.c | 2 +- .../bpf/progs/struct_ops_maybe_null_fail.c | 2 +- .../selftests/bpf/progs/struct_ops_module.c | 2 +- .../selftests/bpf/progs/struct_ops_multi_pages.c | 2 +- .../selftests/bpf/progs/struct_ops_nulled_out_cb.c | 2 +- .../selftests/bpf/progs/struct_ops_private_stack.c | 2 +- .../bpf/progs/struct_ops_private_stack_fail.c | 2 +- .../bpf/progs/struct_ops_private_stack_recur.c | 2 +- .../bpf/progs/test_kfunc_param_nullable.c | 2 +- .../selftests/bpf/progs/test_module_attach.c | 2 +- .../selftests/bpf/progs/test_tp_btf_nullable.c | 2 +- .../testing/selftests/bpf/progs/unsupported_ops.c | 2 +- tools/testing/selftests/bpf/progs/wq.c | 2 +- tools/testing/selftests/bpf/progs/wq_failures.c | 2 +- tools/testing/selftests/bpf/test_kmods/.gitignore | 6 + tools/testing/selftests/bpf/test_kmods/Makefile | 21 + .../selftests/bpf/test_kmods/bpf_test_modorder_x.c | 39 + .../selftests/bpf/test_kmods/bpf_test_modorder_y.c | 39 + .../selftests/bpf/test_kmods/bpf_test_no_cfi.c | 84 ++ .../selftests/bpf/test_kmods/bpf_testmod-events.h | 71 + .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 1487 ++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.h | 113 ++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 162 +++ 59 files changed, 2084 insertions(+), 2162 
deletions(-) delete mode 100644 tools/testing/selftests/bpf/bpf_test_modorder_x/Makefile delete mode 100644 tools/testing/selftests/bpf/bpf_test_modorder_x/bpf_test_modorder_x.c delete mode 100644 tools/testing/selftests/bpf/bpf_test_modorder_y/Makefile delete mode 100644 tools/testing/selftests/bpf/bpf_test_modorder_y/bpf_test_modorder_y.c delete mode 100644 tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile delete mode 100644 tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/.gitignore delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/Makefile delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h delete mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h create mode 100644 tools/testing/selftests/bpf/test_kmods/.gitignore create mode 100644 tools/testing/selftests/bpf/test_kmods/Makefile create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_x.c create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_y.c create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_test_no_cfi.c create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_testmod.h create mode 100644 tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a1964d40a60e..b03df5d295c8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -150,13 +150,13 @@ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh ima_setup.sh verify_sig_setup.sh \ test_xdp_vlan.sh test_bpftool.py +TEST_KMODS := bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ + bpf_test_modorder_y.ko +TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS)) + # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = \ bench \ - bpf_testmod.ko \ - bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko \ - bpf_test_no_cfi.ko \ flow_dissector_load \ runqslower \ test_cpp \ @@ -182,8 +182,9 @@ override define CLEAN $(Q)$(RM) -r $(TEST_GEN_PROGS) $(Q)$(RM) -r $(TEST_GEN_PROGS_EXTENDED) $(Q)$(RM) -r $(TEST_GEN_FILES) + $(Q)$(RM) -r $(TEST_KMODS) $(Q)$(RM) -r $(EXTRA_CLEAN) - $(Q)$(MAKE) -C bpf_testmod clean + $(Q)$(MAKE) -C test_kmods clean $(Q)$(MAKE) docs-clean endef @@ -249,7 +250,7 @@ endif # to build individual tests. # NOTE: Semicolon at the end is critical to override lib.mk's default static # rule for binaries. 
-$(notdir $(TEST_GEN_PROGS) \ +$(notdir $(TEST_GEN_PROGS) $(TEST_KMODS) \ $(TEST_GEN_PROGS_EXTENDED)): %: $(OUTPUT)/% ; # sort removes libbpf duplicates when not cross-building @@ -303,37 +304,19 @@ $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c $< -o $@ \ $(shell $(PKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) -$(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) - $(call msg,MOD,,$@) - $(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C bpf_testmod \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ - EXTRA_CFLAGS='' EXTRA_LDFLAGS='' - $(Q)cp bpf_testmod/bpf_testmod.ko $@ - -$(OUTPUT)/bpf_test_no_cfi.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_no_cfi/Makefile bpf_test_no_cfi/*.[ch]) - $(call msg,MOD,,$@) - $(Q)$(RM) bpf_test_no_cfi/bpf_test_no_cfi.ko # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C bpf_test_no_cfi \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ +# This should really be a grouped target, but make versions before 4.3 don't +# support that for regular rules. However, pattern matching rules are implicitly +# treated as grouped even with older versions of make, so as a workaround, the +# subst() turns the rule into a pattern matching rule +$(addprefix test_kmods/,$(subst .ko,%ko,$(TEST_KMODS))): $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard test_kmods/Makefile test_kmods/*.[ch]) + $(Q)$(RM) test_kmods/*.ko test_kmods/*.mod.o # force re-compilation + $(Q)$(MAKE) $(submake_extras) -C test_kmods \ + RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ EXTRA_CFLAGS='' EXTRA_LDFLAGS='' - $(Q)cp bpf_test_no_cfi/bpf_test_no_cfi.ko $@ -$(OUTPUT)/bpf_test_modorder_x.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_modorder_x/Makefile bpf_test_modorder_x/*.[ch]) +$(TEST_KMOD_TARGETS): $(addprefix test_kmods/,$(TEST_KMODS)) $(call msg,MOD,,$@) - $(Q)$(RM) bpf_test_modorder_x/bpf_test_modorder_x.ko # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C bpf_test_modorder_x \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ - EXTRA_CFLAGS='' EXTRA_LDFLAGS='' - $(Q)cp bpf_test_modorder_x/bpf_test_modorder_x.ko $@ - -$(OUTPUT)/bpf_test_modorder_y.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_test_modorder_y/Makefile bpf_test_modorder_y/*.[ch]) - $(call msg,MOD,,$@) - $(Q)$(RM) bpf_test_modorder_y/bpf_test_modorder_y.ko # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C bpf_test_modorder_y \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ - EXTRA_CFLAGS='' EXTRA_LDFLAGS='' - $(Q)cp bpf_test_modorder_y/bpf_test_modorder_y.ko $@ + $(Q)cp test_kmods/$(@F) $@ DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool @@ -758,14 +741,12 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ flow_dissector_load.h \ ip_check_defrag_frags.h -TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ - $(OUTPUT)/bpf_test_no_cfi.ko \ - $(OUTPUT)/bpf_test_modorder_x.ko \ - $(OUTPUT)/bpf_test_modorder_y.ko \ +TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ $(OUTPUT)/sign-file \ $(OUTPUT)/uprobe_multi \ + $(TEST_KMOD_TARGETS) \ ima_setup.sh \ verify_sig_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) \ @@ -892,12 +873,9 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c uprobe_multi.ld EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature bpftool \ + feature bpftool $(TEST_KMOD_TARGETS) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ - no_alu32 cpuv4 bpf_gcc 
bpf_testmod.ko \ - bpf_test_no_cfi.ko \ - bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko \ + no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ $(OUTPUT)/FEATURE-DUMP.selftests diff --git a/tools/testing/selftests/bpf/bpf_test_modorder_x/Makefile b/tools/testing/selftests/bpf/bpf_test_modorder_x/Makefile deleted file mode 100644 index 40b25b98ad1b..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_modorder_x/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..) - -ifeq ($(V),1) -Q = -else -Q = @ -endif - -MODULES = bpf_test_modorder_x.ko - -obj-m += bpf_test_modorder_x.o - -all: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules - -clean: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean - diff --git a/tools/testing/selftests/bpf/bpf_test_modorder_x/bpf_test_modorder_x.c b/tools/testing/selftests/bpf/bpf_test_modorder_x/bpf_test_modorder_x.c deleted file mode 100644 index 0cc747fa912f..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_modorder_x/bpf_test_modorder_x.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -__bpf_kfunc_start_defs(); - -__bpf_kfunc int bpf_test_modorder_retx(void) -{ - return 'x'; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(bpf_test_modorder_kfunc_x_ids) -BTF_ID_FLAGS(func, bpf_test_modorder_retx); -BTF_KFUNCS_END(bpf_test_modorder_kfunc_x_ids) - -static const struct btf_kfunc_id_set bpf_test_modorder_x_set = { - .owner = THIS_MODULE, - .set = &bpf_test_modorder_kfunc_x_ids, -}; - -static int __init bpf_test_modorder_x_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, - &bpf_test_modorder_x_set); -} - -static void __exit bpf_test_modorder_x_exit(void) -{ -} - -module_init(bpf_test_modorder_x_init); -module_exit(bpf_test_modorder_x_exit); - -MODULE_DESCRIPTION("BPF selftest ordertest module X"); -MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/bpf_test_modorder_y/Makefile b/tools/testing/selftests/bpf/bpf_test_modorder_y/Makefile deleted file mode 100644 index 52c3ab9d84e2..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_modorder_y/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..) 
- -ifeq ($(V),1) -Q = -else -Q = @ -endif - -MODULES = bpf_test_modorder_y.ko - -obj-m += bpf_test_modorder_y.o - -all: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules - -clean: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean - diff --git a/tools/testing/selftests/bpf/bpf_test_modorder_y/bpf_test_modorder_y.c b/tools/testing/selftests/bpf/bpf_test_modorder_y/bpf_test_modorder_y.c deleted file mode 100644 index c627ee085d13..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_modorder_y/bpf_test_modorder_y.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -__bpf_kfunc_start_defs(); - -__bpf_kfunc int bpf_test_modorder_rety(void) -{ - return 'y'; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(bpf_test_modorder_kfunc_y_ids) -BTF_ID_FLAGS(func, bpf_test_modorder_rety); -BTF_KFUNCS_END(bpf_test_modorder_kfunc_y_ids) - -static const struct btf_kfunc_id_set bpf_test_modorder_y_set = { - .owner = THIS_MODULE, - .set = &bpf_test_modorder_kfunc_y_ids, -}; - -static int __init bpf_test_modorder_y_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, - &bpf_test_modorder_y_set); -} - -static void __exit bpf_test_modorder_y_exit(void) -{ -} - -module_init(bpf_test_modorder_y_init); -module_exit(bpf_test_modorder_y_exit); - -MODULE_DESCRIPTION("BPF selftest ordertest module Y"); -MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile b/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile deleted file mode 100644 index ed5143b79edf..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_no_cfi/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -BPF_TEST_NO_CFI_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(BPF_TEST_NO_CFI_DIR)/../../../../..) - -ifeq ($(V),1) -Q = -else -Q = @ -endif - -MODULES = bpf_test_no_cfi.ko - -obj-m += bpf_test_no_cfi.o - -all: - +$(Q)make -C $(KDIR) M=$(BPF_TEST_NO_CFI_DIR) modules - -clean: - +$(Q)make -C $(KDIR) M=$(BPF_TEST_NO_CFI_DIR) clean - diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c deleted file mode 100644 index 948eb3962732..000000000000 --- a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ -#include -#include -#include -#include - -struct bpf_test_no_cfi_ops { - void (*fn_1)(void); - void (*fn_2)(void); -}; - -static int dummy_init(struct btf *btf) -{ - return 0; -} - -static int dummy_init_member(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata) -{ - return 0; -} - -static int dummy_reg(void *kdata, struct bpf_link *link) -{ - return 0; -} - -static void dummy_unreg(void *kdata, struct bpf_link *link) -{ -} - -static const struct bpf_verifier_ops dummy_verifier_ops; - -static void bpf_test_no_cfi_ops__fn_1(void) -{ -} - -static void bpf_test_no_cfi_ops__fn_2(void) -{ -} - -static struct bpf_test_no_cfi_ops __test_no_cif_ops = { - .fn_1 = bpf_test_no_cfi_ops__fn_1, - .fn_2 = bpf_test_no_cfi_ops__fn_2, -}; - -static struct bpf_struct_ops test_no_cif_ops = { - .verifier_ops = &dummy_verifier_ops, - .init = dummy_init, - .init_member = dummy_init_member, - .reg = dummy_reg, - .unreg = dummy_unreg, - .name = "bpf_test_no_cfi_ops", - .owner = THIS_MODULE, -}; - -static int bpf_test_no_cfi_init(void) -{ - int ret; - - ret = register_bpf_struct_ops(&test_no_cif_ops, - bpf_test_no_cfi_ops); - if (!ret) - return -EINVAL; - - test_no_cif_ops.cfi_stubs = &__test_no_cif_ops; - ret = register_bpf_struct_ops(&test_no_cif_ops, - bpf_test_no_cfi_ops); - return ret; -} - -static void bpf_test_no_cfi_exit(void) -{ -} - -module_init(bpf_test_no_cfi_init); -module_exit(bpf_test_no_cfi_exit); - -MODULE_AUTHOR("Kuifeng Lee"); -MODULE_DESCRIPTION("BPF no cfi_stubs test module"); -MODULE_LICENSE("Dual BSD/GPL"); - diff --git a/tools/testing/selftests/bpf/bpf_testmod/.gitignore b/tools/testing/selftests/bpf/bpf_testmod/.gitignore deleted file mode 100644 index ded513777281..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -*.mod -*.mod.c -*.o -.ko -/Module.symvers -/modules.order diff --git a/tools/testing/selftests/bpf/bpf_testmod/Makefile b/tools/testing/selftests/bpf/bpf_testmod/Makefile deleted file mode 100644 index 15cb36c4483a..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..) 
- -ifeq ($(V),1) -Q = -else -Q = @ -endif - -MODULES = bpf_testmod.ko - -obj-m += bpf_testmod.o -CFLAGS_bpf_testmod.o = -I$(src) - -all: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules - -clean: - +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean - diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h deleted file mode 100644 index aeef86b3da74..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2020 Facebook */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bpf_testmod - -#if !defined(_BPF_TESTMOD_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _BPF_TESTMOD_EVENTS_H - -#include -#include "bpf_testmod.h" - -TRACE_EVENT(bpf_testmod_test_read, - TP_PROTO(struct task_struct *task, struct bpf_testmod_test_read_ctx *ctx), - TP_ARGS(task, ctx), - TP_STRUCT__entry( - __field(pid_t, pid) - __array(char, comm, TASK_COMM_LEN) - __field(loff_t, off) - __field(size_t, len) - ), - TP_fast_assign( - __entry->pid = task->pid; - memcpy(__entry->comm, task->comm, TASK_COMM_LEN); - __entry->off = ctx->off; - __entry->len = ctx->len; - ), - TP_printk("pid=%d comm=%s off=%llu len=%zu", - __entry->pid, __entry->comm, __entry->off, __entry->len) -); - -/* A bare tracepoint with no event associated with it */ -DECLARE_TRACE(bpf_testmod_test_write_bare, - TP_PROTO(struct task_struct *task, struct bpf_testmod_test_write_ctx *ctx), - TP_ARGS(task, ctx) -); - -/* Used in bpf_testmod_test_read() to test __nullable suffix */ -DECLARE_TRACE(bpf_testmod_test_nullable_bare, - TP_PROTO(struct bpf_testmod_test_read_ctx *ctx__nullable), - TP_ARGS(ctx__nullable) -); - -struct sk_buff; - -DECLARE_TRACE(bpf_testmod_test_raw_tp_null, - TP_PROTO(struct sk_buff *skb), - TP_ARGS(skb) -); - - -#undef BPF_TESTMOD_DECLARE_TRACE -#ifdef DECLARE_TRACE_WRITABLE -#define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ - DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) -#else -#define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ - DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) -#endif - -BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, - TP_PROTO(struct bpf_testmod_test_writable_ctx *ctx), - TP_ARGS(ctx), - sizeof(struct bpf_testmod_test_writable_ctx) -); - -#endif /* _BPF_TESTMOD_EVENTS_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
-#define TRACE_INCLUDE_FILE bpf_testmod-events -#include diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c deleted file mode 100644 index cc9dde507aba..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ /dev/null @@ -1,1487 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "bpf_testmod.h" -#include "bpf_testmod_kfunc.h" - -#define CREATE_TRACE_POINTS -#include "bpf_testmod-events.h" - -#define CONNECT_TIMEOUT_SEC 1 - -typedef int (*func_proto_typedef)(long); -typedef int (*func_proto_typedef_nested1)(func_proto_typedef); -typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); - -DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; -long bpf_testmod_test_struct_arg_result; -static DEFINE_MUTEX(sock_lock); -static struct socket *sock; - -struct bpf_testmod_struct_arg_1 { - int a; -}; -struct bpf_testmod_struct_arg_2 { - long a; - long b; -}; - -struct bpf_testmod_struct_arg_3 { - int a; - int b[]; -}; - -struct bpf_testmod_struct_arg_4 { - u64 a; - int b; -}; - -struct bpf_testmod_struct_arg_5 { - char a; - short b; - int c; - long d; -}; - -__bpf_hook_start(); - -noinline int -bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) { - bpf_testmod_test_struct_arg_result = a.a + a.b + b + c; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_2(int a, struct bpf_testmod_struct_arg_2 b, int c) { - bpf_testmod_test_struct_arg_result = a + b.a + b.b + c; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_3(int a, int b, struct bpf_testmod_struct_arg_2 c) { - bpf_testmod_test_struct_arg_result = a + b + c.a + c.b; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_4(struct bpf_testmod_struct_arg_1 a, int b, - int c, int d, struct bpf_testmod_struct_arg_2 e) { - bpf_testmod_test_struct_arg_result = a.a + b + c + d + e.a + e.b; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_5(void) { - bpf_testmod_test_struct_arg_result = 1; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_6(struct bpf_testmod_struct_arg_3 *a) { - bpf_testmod_test_struct_arg_result = a->b[0]; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_7(u64 a, void *b, short c, int d, void *e, - struct bpf_testmod_struct_arg_4 f) -{ - bpf_testmod_test_struct_arg_result = a + (long)b + c + d + - (long)e + f.a + f.b; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_8(u64 a, void *b, short c, int d, void *e, - struct bpf_testmod_struct_arg_4 f, int g) -{ - bpf_testmod_test_struct_arg_result = a + (long)b + c + d + - (long)e + f.a + f.b + g; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char f, - short g, struct bpf_testmod_struct_arg_5 h, long i) -{ - bpf_testmod_test_struct_arg_result = a + (long)b + c + d + (long)e + - f + g + h.a + h.b + h.c + h.d + i; - return bpf_testmod_test_struct_arg_result; -} - -noinline int -bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) 
{ - bpf_testmod_test_struct_arg_result = a->a; - return bpf_testmod_test_struct_arg_result; -} - -__bpf_kfunc void -bpf_testmod_test_mod_kfunc(int i) -{ - *(int *)this_cpu_ptr(&bpf_testmod_ksym_percpu) = i; -} - -__bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) -{ - it->cnt = cnt; - - if (cnt < 0) - return -EINVAL; - - it->value = value; - - return 0; -} - -__bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it) -{ - if (it->cnt <= 0) - return NULL; - - it->cnt--; - - return &it->value; -} - -__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) -{ - if (it__iter->cnt < 0) - return 0; - - return val + it__iter->value; -} - -__bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) -{ - it->cnt = 0; -} - -__bpf_kfunc void bpf_kfunc_common_test(void) -{ -} - -__bpf_kfunc void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, - struct bpf_dynptr *ptr__nullable) -{ -} - -__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) -{ - return NULL; -} - -__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) -{ - return NULL; -} - -__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) -{ -} - -__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) -{ -} - -__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) -{ -} - -__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) -{ -} - -__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) -{ -} - -__bpf_kfunc struct bpf_testmod_ctx * -bpf_testmod_ctx_create(int *err) -{ - struct bpf_testmod_ctx *ctx; - - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); - if (!ctx) { - *err = -ENOMEM; - return NULL; - } - refcount_set(&ctx->usage, 1); - - return ctx; -} - -static void testmod_free_cb(struct rcu_head *head) -{ - struct bpf_testmod_ctx *ctx; - - ctx = container_of(head, struct bpf_testmod_ctx, rcu); - kfree(ctx); -} - -__bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) -{ - if (!ctx) - return; - if (refcount_dec_and_test(&ctx->usage)) - call_rcu(&ctx->rcu, testmod_free_cb); -} - -static struct bpf_testmod_ops3 *st_ops3; - -static int bpf_testmod_test_3(void) -{ - return 0; -} - -static int bpf_testmod_test_4(void) -{ - return 0; -} - -static struct bpf_testmod_ops3 __bpf_testmod_ops3 = { - .test_1 = bpf_testmod_test_3, - .test_2 = bpf_testmod_test_4, -}; - -static void bpf_testmod_test_struct_ops3(void) -{ - if (st_ops3) - st_ops3->test_1(); -} - -__bpf_kfunc void bpf_testmod_ops3_call_test_1(void) -{ - st_ops3->test_1(); -} - -__bpf_kfunc void bpf_testmod_ops3_call_test_2(void) -{ - st_ops3->test_2(); -} - -struct bpf_testmod_btf_type_tag_1 { - int a; -}; - -struct bpf_testmod_btf_type_tag_2 { - struct bpf_testmod_btf_type_tag_1 __user *p; -}; - -struct bpf_testmod_btf_type_tag_3 { - struct bpf_testmod_btf_type_tag_1 __percpu *p; -}; - -noinline int -bpf_testmod_test_btf_type_tag_user_1(struct bpf_testmod_btf_type_tag_1 __user *arg) { - BTF_TYPE_EMIT(func_proto_typedef); - BTF_TYPE_EMIT(func_proto_typedef_nested1); - BTF_TYPE_EMIT(func_proto_typedef_nested2); - return arg->a; -} - -noinline int -bpf_testmod_test_btf_type_tag_user_2(struct bpf_testmod_btf_type_tag_2 *arg) { - return arg->p->a; -} - -noinline int -bpf_testmod_test_btf_type_tag_percpu_1(struct bpf_testmod_btf_type_tag_1 __percpu *arg) { - return arg->a; -} - -noinline int -bpf_testmod_test_btf_type_tag_percpu_2(struct 
bpf_testmod_btf_type_tag_3 *arg) { - return arg->p->a; -} - -noinline int bpf_testmod_loop_test(int n) -{ - /* Make sum volatile, so smart compilers, such as clang, will not - * optimize the code by removing the loop. - */ - volatile int sum = 0; - int i; - - /* the primary goal of this test is to test LBR. Create a lot of - * branches in the function, so we can catch it easily. - */ - for (i = 0; i < n; i++) - sum += i; - return sum; -} - -__weak noinline struct file *bpf_testmod_return_ptr(int arg) -{ - static struct file f = {}; - - switch (arg) { - case 1: return (void *)EINVAL; /* user addr */ - case 2: return (void *)0xcafe4a11; /* user addr */ - case 3: return (void *)-EINVAL; /* canonical, but invalid */ - case 4: return (void *)(1ull << 60); /* non-canonical and invalid */ - case 5: return (void *)~(1ull << 30); /* trigger extable */ - case 6: return &f; /* valid addr */ - case 7: return (void *)((long)&f | 1); /* kernel tricks */ -#ifdef CONFIG_X86_64 - case 8: return (void *)VSYSCALL_ADDR; /* vsyscall page address */ -#endif - default: return NULL; - } -} - -noinline int bpf_testmod_fentry_test1(int a) -{ - return a + 1; -} - -noinline int bpf_testmod_fentry_test2(int a, u64 b) -{ - return a + b; -} - -noinline int bpf_testmod_fentry_test3(char a, int b, u64 c) -{ - return a + b + c; -} - -noinline int bpf_testmod_fentry_test7(u64 a, void *b, short c, int d, - void *e, char f, int g) -{ - return a + (long)b + c + d + (long)e + f + g; -} - -noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, - void *e, char f, int g, - unsigned int h, long i, __u64 j, - unsigned long k) -{ - return a + (long)b + c + d + (long)e + f + g + h + i + j + k; -} - -int bpf_testmod_fentry_ok; - -noinline ssize_t -bpf_testmod_test_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - struct bpf_testmod_test_read_ctx ctx = { - .buf = buf, - .off = off, - .len = len, - }; - struct bpf_testmod_struct_arg_1 struct_arg1 = {10}, struct_arg1_2 = {-1}; - struct bpf_testmod_struct_arg_2 struct_arg2 = {2, 3}; - struct bpf_testmod_struct_arg_3 *struct_arg3; - struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22}; - struct bpf_testmod_struct_arg_5 struct_arg5 = {23, 24, 25, 26}; - int i = 1; - - while (bpf_testmod_return_ptr(i)) - i++; - - (void)bpf_testmod_test_struct_arg_1(struct_arg2, 1, 4); - (void)bpf_testmod_test_struct_arg_2(1, struct_arg2, 4); - (void)bpf_testmod_test_struct_arg_3(1, 4, struct_arg2); - (void)bpf_testmod_test_struct_arg_4(struct_arg1, 1, 2, 3, struct_arg2); - (void)bpf_testmod_test_struct_arg_5(); - (void)bpf_testmod_test_struct_arg_7(16, (void *)17, 18, 19, - (void *)20, struct_arg4); - (void)bpf_testmod_test_struct_arg_8(16, (void *)17, 18, 19, - (void *)20, struct_arg4, 23); - (void)bpf_testmod_test_struct_arg_9(16, (void *)17, 18, 19, (void *)20, - 21, 22, struct_arg5, 27); - - (void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2); - - (void)trace_bpf_testmod_test_raw_tp_null(NULL); - - bpf_testmod_test_struct_ops3(); - - struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) + - sizeof(int)), GFP_KERNEL); - if (struct_arg3 != NULL) { - struct_arg3->b[0] = 1; - (void)bpf_testmod_test_struct_arg_6(struct_arg3); - kfree(struct_arg3); - } - - /* This is always true. Use the check to make sure the compiler - * doesn't remove bpf_testmod_loop_test. 
- */ - if (bpf_testmod_loop_test(101) > 100) - trace_bpf_testmod_test_read(current, &ctx); - - trace_bpf_testmod_test_nullable_bare(NULL); - - /* Magic number to enable writable tp */ - if (len == 64) { - struct bpf_testmod_test_writable_ctx writable = { - .val = 1024, - }; - trace_bpf_testmod_test_writable_bare(&writable); - if (writable.early_ret) - return snprintf(buf, len, "%d\n", writable.val); - } - - if (bpf_testmod_fentry_test1(1) != 2 || - bpf_testmod_fentry_test2(2, 3) != 5 || - bpf_testmod_fentry_test3(4, 5, 6) != 15 || - bpf_testmod_fentry_test7(16, (void *)17, 18, 19, (void *)20, - 21, 22) != 133 || - bpf_testmod_fentry_test11(16, (void *)17, 18, 19, (void *)20, - 21, 22, 23, 24, 25, 26) != 231) - goto out; - - bpf_testmod_fentry_ok = 1; -out: - return -EIO; /* always fail */ -} -EXPORT_SYMBOL(bpf_testmod_test_read); -ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); - -noinline ssize_t -bpf_testmod_test_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - struct bpf_testmod_test_write_ctx ctx = { - .buf = buf, - .off = off, - .len = len, - }; - - trace_bpf_testmod_test_write_bare(current, &ctx); - - return -EIO; /* always fail */ -} -EXPORT_SYMBOL(bpf_testmod_test_write); -ALLOW_ERROR_INJECTION(bpf_testmod_test_write, ERRNO); - -noinline int bpf_fentry_shadow_test(int a) -{ - return a + 2; -} -EXPORT_SYMBOL_GPL(bpf_fentry_shadow_test); - -__bpf_hook_end(); - -static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { - .attr = { .name = "bpf_testmod", .mode = 0666, }, - .read = bpf_testmod_test_read, - .write = bpf_testmod_test_write, -}; - -/* bpf_testmod_uprobe sysfs attribute is so far enabled for x86_64 only, - * please see test_uretprobe_regs_change test - */ -#ifdef __x86_64__ - -static int -uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, - struct pt_regs *regs, __u64 *data) - -{ - regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; - regs->r11 = (u64) -1; - return true; -} - -struct testmod_uprobe { - struct path path; - struct uprobe *uprobe; - struct uprobe_consumer consumer; -}; - -static DEFINE_MUTEX(testmod_uprobe_mutex); - -static struct testmod_uprobe uprobe = { - .consumer.ret_handler = uprobe_ret_handler, -}; - -static int testmod_register_uprobe(loff_t offset) -{ - int err = -EBUSY; - - if (uprobe.uprobe) - return -EBUSY; - - mutex_lock(&testmod_uprobe_mutex); - - if (uprobe.uprobe) - goto out; - - err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); - if (err) - goto out; - - uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (IS_ERR(uprobe.uprobe)) { - err = PTR_ERR(uprobe.uprobe); - path_put(&uprobe.path); - uprobe.uprobe = NULL; - } -out: - mutex_unlock(&testmod_uprobe_mutex); - return err; -} - -static void testmod_unregister_uprobe(void) -{ - mutex_lock(&testmod_uprobe_mutex); - - if (uprobe.uprobe) { - uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); - uprobe_unregister_sync(); - path_put(&uprobe.path); - uprobe.uprobe = NULL; - } - - mutex_unlock(&testmod_uprobe_mutex); -} - -static ssize_t -bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - unsigned long offset = 0; - int err = 0; - - if (kstrtoul(buf, 0, &offset)) - return -EINVAL; - - if (offset) - err = testmod_register_uprobe(offset); - else - testmod_unregister_uprobe(); - - return err ?: strlen(buf); -} - -static 
struct bin_attribute bin_attr_bpf_testmod_uprobe_file __ro_after_init = { - .attr = { .name = "bpf_testmod_uprobe", .mode = 0666, }, - .write = bpf_testmod_uprobe_write, -}; - -static int register_bpf_testmod_uprobe(void) -{ - return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file); -} - -static void unregister_bpf_testmod_uprobe(void) -{ - testmod_unregister_uprobe(); - sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file); -} - -#else -static int register_bpf_testmod_uprobe(void) -{ - return 0; -} - -static void unregister_bpf_testmod_uprobe(void) { } -#endif - -BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) -BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) -BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) -BTF_ID_FLAGS(func, bpf_kfunc_common_test) -BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) -BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) -BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) -BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) -BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) -BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) -BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) - -BTF_ID_LIST(bpf_testmod_dtor_ids) -BTF_ID(struct, bpf_testmod_ctx) -BTF_ID(func, bpf_testmod_ctx_release) - -static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { - .owner = THIS_MODULE, - .set = &bpf_testmod_common_kfunc_ids, -}; - -__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) -{ - return a + b + c + d; -} - -__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) -{ - return a + b; -} - -__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk) -{ - return sk; -} - -__bpf_kfunc long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d) -{ - /* Provoke the compiler to assume that the caller has sign-extended a, - * b and c on platforms where this is required (e.g. s390x). 
- */ - return (long)a + (long)b + (long)c + d; -} - -static struct prog_test_ref_kfunc prog_test_struct = { - .a = 42, - .b = 108, - .next = &prog_test_struct, - .cnt = REFCOUNT_INIT(1), -}; - -__bpf_kfunc struct prog_test_ref_kfunc * -bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) -{ - refcount_inc(&prog_test_struct.cnt); - return &prog_test_struct; -} - -__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p) -{ - WARN_ON_ONCE(1); -} - -__bpf_kfunc struct prog_test_member * -bpf_kfunc_call_memb_acquire(void) -{ - WARN_ON_ONCE(1); - return NULL; -} - -__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) -{ - WARN_ON_ONCE(1); -} - -static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size) -{ - if (size > 2 * sizeof(int)) - return NULL; - - return (int *)p; -} - -__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, - const int rdwr_buf_size) -{ - return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size); -} - -__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, - const int rdonly_buf_size) -{ - return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); -} - -/* the next 2 ones can't be really used for testing expect to ensure - * that the verifier rejects the call. - * Acquire functions must return struct pointers, so these ones are - * failing. - */ -__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, - const int rdonly_buf_size) -{ - return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); -} - -__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) -{ -} - -__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) -{ - /* p != NULL, but p->cnt could be 0 */ -} - -__bpf_kfunc void bpf_kfunc_call_test_destructive(void) -{ -} - -__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) -{ - return arg; -} - -__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) -{ -} - -__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) -{ - int proto; - int err; - - mutex_lock(&sock_lock); - - if (sock) { - pr_err("%s called without releasing old sock", __func__); - err = -EPERM; - goto out; - } - - switch (args->af) { - case AF_INET: - case AF_INET6: - proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; - break; - case AF_UNIX: - proto = PF_UNIX; - break; - default: - pr_err("invalid address family %d\n", args->af); - err = -EINVAL; - goto out; - } - - err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, - proto, &sock); - - if (!err) - /* Set timeout for call to kernel_connect() to prevent it from hanging, - * and consider the connection attempt failed if it returns - * -EINPROGRESS. 
- */ - sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc void bpf_kfunc_close_sock(void) -{ - mutex_lock(&sock_lock); - - if (sock) { - sock_release(sock); - sock = NULL; - } - - mutex_unlock(&sock_lock); -} - -__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) -{ - int err; - - if (args->addrlen > sizeof(args->addr)) - return -EINVAL; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_connect(sock, (struct sockaddr *)&args->addr, - args->addrlen, 0); -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) -{ - int err; - - if (args->addrlen > sizeof(args->addr)) - return -EINVAL; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) -{ - int err; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_listen(sock, 128); -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) -{ - struct msghdr msg = { - .msg_name = &args->addr.addr, - .msg_namelen = args->addr.addrlen, - }; - struct kvec iov; - int err; - - if (args->addr.addrlen > sizeof(args->addr.addr) || - args->msglen > sizeof(args->msg)) - return -EINVAL; - - iov.iov_base = args->msg; - iov.iov_len = args->msglen; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); - args->addr.addrlen = msg.msg_namelen; -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) -{ - struct msghdr msg = { - .msg_name = &args->addr.addr, - .msg_namelen = args->addr.addrlen, - }; - struct kvec iov; - int err; - - if (args->addr.addrlen > sizeof(args->addr.addr) || - args->msglen > sizeof(args->msg)) - return -EINVAL; - - iov.iov_base = args->msg; - iov.iov_len = args->msglen; - - iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = sock_sendmsg(sock, &msg); - args->addr.addrlen = msg.msg_namelen; -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) -{ - int err; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); - if (err < 0) - goto out; - - args->addrlen = err; - err = 0; -out: - mutex_unlock(&sock_lock); - - return err; -} - -__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) -{ - int err; - - mutex_lock(&sock_lock); - - if (!sock) { - pr_err("%s called without initializing sock", __func__); - err = -EPERM; - goto out; - } - - err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); - if (err < 0) - goto out; - - args->addrlen = err; - err = 0; -out: - 
mutex_unlock(&sock_lock); - - return err; -} - -static DEFINE_MUTEX(st_ops_mutex); -static struct bpf_testmod_st_ops *st_ops; - -__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) -{ - int ret = -1; - - mutex_lock(&st_ops_mutex); - if (st_ops && st_ops->test_prologue) - ret = st_ops->test_prologue(args); - mutex_unlock(&st_ops_mutex); - - return ret; -} - -__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) -{ - int ret = -1; - - mutex_lock(&st_ops_mutex); - if (st_ops && st_ops->test_epilogue) - ret = st_ops->test_epilogue(args); - mutex_unlock(&st_ops_mutex); - - return ret; -} - -__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) -{ - int ret = -1; - - mutex_lock(&st_ops_mutex); - if (st_ops && st_ops->test_pro_epilogue) - ret = st_ops->test_pro_epilogue(args); - mutex_unlock(&st_ops_mutex); - - return ret; -} - -__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) -{ - args->a += 10; - return args->a; -} - -BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) -BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) -BTF_ID_FLAGS(func, bpf_kfunc_call_test1) -BTF_ID_FLAGS(func, bpf_kfunc_call_test2) -BTF_ID_FLAGS(func, bpf_kfunc_call_test3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test4) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) - -static int bpf_testmod_ops_init(struct btf *btf) -{ - return 0; -} - -static bool 
bpf_testmod_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - return bpf_tracing_btf_ctx_access(off, size, type, prog, info); -} - -static int bpf_testmod_ops_init_member(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata) -{ - if (member->offset == offsetof(struct bpf_testmod_ops, data) * 8) { - /* For data fields, this function has to copy it and return - * 1 to indicate that the data has been handled by the - * struct_ops type, or the verifier will reject the map if - * the value of the data field is not zero. - */ - ((struct bpf_testmod_ops *)kdata)->data = ((struct bpf_testmod_ops *)udata)->data; - return 1; - } - return 0; -} - -static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = { - .owner = THIS_MODULE, - .set = &bpf_testmod_check_kfunc_ids, -}; - -static const struct bpf_verifier_ops bpf_testmod_verifier_ops = { - .is_valid_access = bpf_testmod_ops_is_valid_access, -}; - -static const struct bpf_verifier_ops bpf_testmod_verifier_ops3 = { - .is_valid_access = bpf_testmod_ops_is_valid_access, -}; - -static int bpf_dummy_reg(void *kdata, struct bpf_link *link) -{ - struct bpf_testmod_ops *ops = kdata; - - if (ops->test_1) - ops->test_1(); - /* Some test cases (ex. struct_ops_maybe_null) may not have test_2 - * initialized, so we need to check for NULL. - */ - if (ops->test_2) - ops->test_2(4, ops->data); - - return 0; -} - -static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) -{ -} - -static int bpf_testmod_test_1(void) -{ - return 0; -} - -static void bpf_testmod_test_2(int a, int b) -{ -} - -static int bpf_testmod_tramp(int value) -{ - return 0; -} - -static int bpf_testmod_ops__test_maybe_null(int dummy, - struct task_struct *task__nullable) -{ - return 0; -} - -static struct bpf_testmod_ops __bpf_testmod_ops = { - .test_1 = bpf_testmod_test_1, - .test_2 = bpf_testmod_test_2, - .test_maybe_null = bpf_testmod_ops__test_maybe_null, -}; - -struct bpf_struct_ops bpf_bpf_testmod_ops = { - .verifier_ops = &bpf_testmod_verifier_ops, - .init = bpf_testmod_ops_init, - .init_member = bpf_testmod_ops_init_member, - .reg = bpf_dummy_reg, - .unreg = bpf_dummy_unreg, - .cfi_stubs = &__bpf_testmod_ops, - .name = "bpf_testmod_ops", - .owner = THIS_MODULE, -}; - -static int bpf_dummy_reg2(void *kdata, struct bpf_link *link) -{ - struct bpf_testmod_ops2 *ops = kdata; - - ops->test_1(); - return 0; -} - -static struct bpf_testmod_ops2 __bpf_testmod_ops2 = { - .test_1 = bpf_testmod_test_1, -}; - -struct bpf_struct_ops bpf_testmod_ops2 = { - .verifier_ops = &bpf_testmod_verifier_ops, - .init = bpf_testmod_ops_init, - .init_member = bpf_testmod_ops_init_member, - .reg = bpf_dummy_reg2, - .unreg = bpf_dummy_unreg, - .cfi_stubs = &__bpf_testmod_ops2, - .name = "bpf_testmod_ops2", - .owner = THIS_MODULE, -}; - -static int st_ops3_reg(void *kdata, struct bpf_link *link) -{ - int err = 0; - - mutex_lock(&st_ops_mutex); - if (st_ops3) { - pr_err("st_ops has already been registered\n"); - err = -EEXIST; - goto unlock; - } - st_ops3 = kdata; - -unlock: - mutex_unlock(&st_ops_mutex); - return err; -} - -static void st_ops3_unreg(void *kdata, struct bpf_link *link) -{ - mutex_lock(&st_ops_mutex); - st_ops3 = NULL; - mutex_unlock(&st_ops_mutex); -} - -static void test_1_recursion_detected(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - - stats = this_cpu_ptr(prog->stats); - printk("bpf_testmod: oh no, recursing into test_1, recursion_misses %llu", - 
u64_stats_read(&stats->misses)); -} - -static int st_ops3_check_member(const struct btf_type *t, - const struct btf_member *member, - const struct bpf_prog *prog) -{ - u32 moff = __btf_member_bit_offset(t, member) / 8; - - switch (moff) { - case offsetof(struct bpf_testmod_ops3, test_1): - prog->aux->priv_stack_requested = true; - prog->aux->recursion_detected = test_1_recursion_detected; - fallthrough; - default: - break; - } - return 0; -} - -struct bpf_struct_ops bpf_testmod_ops3 = { - .verifier_ops = &bpf_testmod_verifier_ops3, - .init = bpf_testmod_ops_init, - .init_member = bpf_testmod_ops_init_member, - .reg = st_ops3_reg, - .unreg = st_ops3_unreg, - .check_member = st_ops3_check_member, - .cfi_stubs = &__bpf_testmod_ops3, - .name = "bpf_testmod_ops3", - .owner = THIS_MODULE, -}; - -static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) -{ - return 0; -} - -static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) -{ - return 0; -} - -static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) -{ - return 0; -} - -static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, - const struct bpf_prog *prog) -{ - struct bpf_insn *insn = insn_buf; - - if (strcmp(prog->aux->attach_func_name, "test_prologue") && - strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) - return 0; - - /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". - * r7 = r6->a; - * r7 += 1000; - * r6->a = r7; - */ - *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); - *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); - *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); - *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); - *insn++ = prog->insnsi[0]; - - return insn - insn_buf; -} - -static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, - s16 ctx_stack_off) -{ - struct bpf_insn *insn = insn_buf; - - if (strcmp(prog->aux->attach_func_name, "test_epilogue") && - strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) - return 0; - - /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" - * r1 = r1[0]; // r1 will be "struct st_ops *args" - * r6 = r1->a; - * r6 += 10000; - * r1->a = r6; - * r0 = r6; - * r0 *= 2; - * BPF_EXIT; - */ - *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); - *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); - *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); - *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); - *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); - *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); - *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); - *insn++ = BPF_EXIT_INSN(); - - return insn - insn_buf; -} - -static int st_ops_btf_struct_access(struct bpf_verifier_log *log, - const struct bpf_reg_state *reg, - int off, int size) -{ - if (off < 0 || off + size > sizeof(struct st_ops_args)) - return -EACCES; - return 0; -} - -static const struct bpf_verifier_ops st_ops_verifier_ops = { - .is_valid_access = bpf_testmod_ops_is_valid_access, - .btf_struct_access = st_ops_btf_struct_access, - .gen_prologue = st_ops_gen_prologue, - .gen_epilogue = st_ops_gen_epilogue, - .get_func_proto = bpf_base_func_proto, -}; - -static struct bpf_testmod_st_ops st_ops_cfi_stubs = { - .test_prologue = bpf_test_mod_st_ops__test_prologue, - .test_epilogue = bpf_test_mod_st_ops__test_epilogue, - .test_pro_epilogue = 
bpf_test_mod_st_ops__test_pro_epilogue, -}; - -static int st_ops_reg(void *kdata, struct bpf_link *link) -{ - int err = 0; - - mutex_lock(&st_ops_mutex); - if (st_ops) { - pr_err("st_ops has already been registered\n"); - err = -EEXIST; - goto unlock; - } - st_ops = kdata; - -unlock: - mutex_unlock(&st_ops_mutex); - return err; -} - -static void st_ops_unreg(void *kdata, struct bpf_link *link) -{ - mutex_lock(&st_ops_mutex); - st_ops = NULL; - mutex_unlock(&st_ops_mutex); -} - -static int st_ops_init(struct btf *btf) -{ - return 0; -} - -static int st_ops_init_member(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata) -{ - return 0; -} - -static struct bpf_struct_ops testmod_st_ops = { - .verifier_ops = &st_ops_verifier_ops, - .init = st_ops_init, - .init_member = st_ops_init_member, - .reg = st_ops_reg, - .unreg = st_ops_unreg, - .cfi_stubs = &st_ops_cfi_stubs, - .name = "bpf_testmod_st_ops", - .owner = THIS_MODULE, -}; - -extern int bpf_fentry_test1(int a); - -static int bpf_testmod_init(void) -{ - const struct btf_id_dtor_kfunc bpf_testmod_dtors[] = { - { - .btf_id = bpf_testmod_dtor_ids[0], - .kfunc_btf_id = bpf_testmod_dtor_ids[1] - }, - }; - void **tramp; - int ret; - - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); - ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); - ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); - ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); - ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); - ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); - ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); - ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3); - ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); - ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, - ARRAY_SIZE(bpf_testmod_dtors), - THIS_MODULE); - if (ret < 0) - return ret; - if (bpf_fentry_test1(0) < 0) - return -EINVAL; - sock = NULL; - mutex_init(&sock_lock); - ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); - if (ret < 0) - return ret; - ret = register_bpf_testmod_uprobe(); - if (ret < 0) - return ret; - - /* Ensure nothing is between tramp_1..tramp_40 */ - BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != - offsetofend(struct bpf_testmod_ops, tramp_40)); - tramp = (void **)&__bpf_testmod_ops.tramp_1; - while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) - *tramp++ = bpf_testmod_tramp; - - return 0; -} - -static void bpf_testmod_exit(void) -{ - /* Need to wait for all references to be dropped because - * bpf_kfunc_call_test_release() which currently resides in kernel can - * be called after bpf_testmod is unloaded. Once release function is - * moved into the module this wait can be removed. 
- */ - while (refcount_read(&prog_test_struct.cnt) > 1) - msleep(20); - - bpf_kfunc_close_sock(); - sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); - unregister_bpf_testmod_uprobe(); -} - -module_init(bpf_testmod_init); -module_exit(bpf_testmod_exit); - -MODULE_AUTHOR("Andrii Nakryiko"); -MODULE_DESCRIPTION("BPF selftests module"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h deleted file mode 100644 index 356803d1c10e..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2020 Facebook */ -#ifndef _BPF_TESTMOD_H -#define _BPF_TESTMOD_H - -#include - -struct task_struct; - -struct bpf_testmod_test_read_ctx { - char *buf; - loff_t off; - size_t len; -}; - -struct bpf_testmod_test_write_ctx { - char *buf; - loff_t off; - size_t len; -}; - -struct bpf_testmod_test_writable_ctx { - bool early_ret; - int val; -}; - -/* BPF iter that returns *value* *n* times in a row */ -struct bpf_iter_testmod_seq { - s64 value; - int cnt; -}; - -struct bpf_testmod_ops { - int (*test_1)(void); - void (*test_2)(int a, int b); - /* Used to test nullable arguments. */ - int (*test_maybe_null)(int dummy, struct task_struct *task); - int (*unsupported_ops)(void); - - /* The following fields are used to test shadow copies. */ - char onebyte; - struct { - int a; - int b; - } unsupported; - int data; - - /* The following pointers are used to test the maps having multiple - * pages of trampolines. - */ - int (*tramp_1)(int value); - int (*tramp_2)(int value); - int (*tramp_3)(int value); - int (*tramp_4)(int value); - int (*tramp_5)(int value); - int (*tramp_6)(int value); - int (*tramp_7)(int value); - int (*tramp_8)(int value); - int (*tramp_9)(int value); - int (*tramp_10)(int value); - int (*tramp_11)(int value); - int (*tramp_12)(int value); - int (*tramp_13)(int value); - int (*tramp_14)(int value); - int (*tramp_15)(int value); - int (*tramp_16)(int value); - int (*tramp_17)(int value); - int (*tramp_18)(int value); - int (*tramp_19)(int value); - int (*tramp_20)(int value); - int (*tramp_21)(int value); - int (*tramp_22)(int value); - int (*tramp_23)(int value); - int (*tramp_24)(int value); - int (*tramp_25)(int value); - int (*tramp_26)(int value); - int (*tramp_27)(int value); - int (*tramp_28)(int value); - int (*tramp_29)(int value); - int (*tramp_30)(int value); - int (*tramp_31)(int value); - int (*tramp_32)(int value); - int (*tramp_33)(int value); - int (*tramp_34)(int value); - int (*tramp_35)(int value); - int (*tramp_36)(int value); - int (*tramp_37)(int value); - int (*tramp_38)(int value); - int (*tramp_39)(int value); - int (*tramp_40)(int value); -}; - -struct bpf_testmod_ops2 { - int (*test_1)(void); -}; - -struct bpf_testmod_ops3 { - int (*test_1)(void); - int (*test_2)(void); -}; - -struct st_ops_args { - u64 a; -}; - -struct bpf_testmod_st_ops { - int (*test_prologue)(struct st_ops_args *args); - int (*test_epilogue)(struct st_ops_args *args); - int (*test_pro_epilogue)(struct st_ops_args *args); - struct module *owner; -}; - -#endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h deleted file mode 100644 index b58817938deb..000000000000 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ /dev/null @@ -1,162 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _BPF_TESTMOD_KFUNC_H -#define _BPF_TESTMOD_KFUNC_H - -#ifndef __KERNEL__ -#include -#include -#else -#define __ksym -struct prog_test_member1 { - int a; -}; - -struct prog_test_member { - struct prog_test_member1 m; - int c; -}; - -struct prog_test_ref_kfunc { - int a; - int b; - struct prog_test_member memb; - struct prog_test_ref_kfunc *next; - refcount_t cnt; -}; -#endif - -struct prog_test_pass1 { - int x0; - struct { - int x1; - struct { - int x2; - struct { - int x3; - }; - }; - }; -}; - -struct prog_test_pass2 { - int len; - short arr1[4]; - struct { - char arr2[4]; - unsigned long arr3[8]; - } x; -}; - -struct prog_test_fail1 { - void *p; - int x; -}; - -struct prog_test_fail2 { - int x8; - struct prog_test_pass1 x; -}; - -struct prog_test_fail3 { - int len; - char arr1[2]; - char arr2[]; -}; - -struct init_sock_args { - int af; - int type; -}; - -struct addr_args { - char addr[sizeof(struct __kernel_sockaddr_storage)]; - int addrlen; -}; - -struct sendmsg_args { - struct addr_args addr; - char msg[10]; - int msglen; -}; - -struct bpf_testmod_ctx { - struct callback_head rcu; - refcount_t usage; -}; - -struct prog_test_ref_kfunc * -bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; -void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; -void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) __ksym; - -void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym; -int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym; -int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; -int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; -void bpf_kfunc_call_int_mem_release(int *p) __ksym; - -/* The bpf_kfunc_call_test_static_unused_arg is defined as static, - * but bpf program compilation needs to see it as global symbol. 
- */ -#ifndef __KERNEL__ -u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) __ksym; -#endif - -void bpf_testmod_test_mod_kfunc(int i) __ksym; - -__u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, - __u32 c, __u64 d) __ksym; -int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; -struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; -long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; - -void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym; -void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; -void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; -void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; - -void bpf_kfunc_call_test_destructive(void) __ksym; -void bpf_kfunc_call_test_sleepable(void) __ksym; - -void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); -struct prog_test_member *bpf_kfunc_call_memb_acquire(void); -void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p); -void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p); -void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p); -void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); -void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); - -void bpf_kfunc_common_test(void) __ksym; - -int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; -void bpf_kfunc_close_sock(void) __ksym; -int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; -int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; -int bpf_kfunc_call_kernel_listen(void) __ksym; -int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; -int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; -int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; -int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; - -void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, struct bpf_dynptr *ptr__nullable) __ksym; - -struct bpf_testmod_ctx *bpf_testmod_ctx_create(int *err) __ksym; -void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) __ksym; - -struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) __ksym; -struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; -void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; - -struct st_ops_args; -int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) __ksym; -int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; -int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; -int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; - -void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; -void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; -void bpf_kfunc_trusted_num_test(int *ptr) __ksym; -void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; - -#endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 1c682550e0e7..e10ea92c3fe2 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -2,7 +2,7 @@ #define _GNU_SOURCE #include #include "progs/core_reloc_types.h" -#include "bpf_testmod/bpf_testmod.h" +#include "test_kmods/bpf_testmod.h" #include #include #include diff --git a/tools/testing/selftests/bpf/progs/bad_struct_ops.c b/tools/testing/selftests/bpf/progs/bad_struct_ops.c index b7e175cd0af0..b3f77b4561c8 
100644 --- a/tools/testing/selftests/bpf/progs/bad_struct_ops.c +++ b/tools/testing/selftests/bpf/progs/bad_struct_ops.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c index 56c764df8196..5d6fc7f01ebb 100644 --- a/tools/testing/selftests/bpf/progs/cb_refs.c +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -2,7 +2,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" struct map_value { struct prog_test_ref_kfunc __kptr *ptr; diff --git a/tools/testing/selftests/bpf/progs/epilogue_exit.c b/tools/testing/selftests/bpf/progs/epilogue_exit.c index 33d3a57bee90..35fec7c75bef 100644 --- a/tools/testing/selftests/bpf/progs/epilogue_exit.c +++ b/tools/testing/selftests/bpf/progs/epilogue_exit.c @@ -4,8 +4,8 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/epilogue_tailcall.c b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c index 7275dd594de0..153514691ba4 100644 --- a/tools/testing/selftests/bpf/progs/epilogue_tailcall.c +++ b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c @@ -4,8 +4,8 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index df1d3db60b1b..9e4b45201e69 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -4,7 +4,7 @@ #include "bpf_experimental.h" #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/jit_probe_mem.c b/tools/testing/selftests/bpf/progs/jit_probe_mem.c index f9789e668297..82190d79de37 100644 --- a/tools/testing/selftests/bpf/progs/jit_probe_mem.c +++ b/tools/testing/selftests/bpf/progs/jit_probe_mem.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" static struct prog_test_ref_kfunc __kptr *v; long total_sum = -1; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c b/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c index 7632d9ecb253..b9670e9a6e3d 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" SEC("tc") int kfunc_destructive_test(void) diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c index 08fae306539c..a1963497f0bf 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c @@ -2,7 
+2,7 @@ /* Copyright (c) 2021 Facebook */ #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" struct syscall_test_args { __u8 data[16]; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_race.c b/tools/testing/selftests/bpf/progs/kfunc_call_race.c index d532af07decf..48f64827cd93 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_race.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_race.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" SEC("tc") int kfunc_call_fail(struct __sk_buff *ctx) diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c index f502f755f567..8b86113a0126 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -2,7 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" SEC("tc") int kfunc_call_test4(struct __sk_buff *skb) diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c index 2380c75e74ce..8e150e85b50d 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" extern const int bpf_prog_active __ksym; int active_res = -1; diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c index b092a72b2c9d..d736506a4c80 100644 --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c @@ -6,7 +6,7 @@ #include #include #include "../bpf_experimental.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" struct plain_local; diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index ab0ce1d01a4a..edaba481db9d 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -2,7 +2,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" struct map_value { struct prog_test_ref_kfunc __kptr_untrusted *unref_ptr; diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 450bb373b179..c2a6bd392e48 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -4,7 +4,7 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" struct map_value { char buf[8]; diff --git a/tools/testing/selftests/bpf/progs/missed_kprobe.c b/tools/testing/selftests/bpf/progs/missed_kprobe.c index 7f9ef701f5de..51a4fe64c917 100644 --- a/tools/testing/selftests/bpf/progs/missed_kprobe.c +++ b/tools/testing/selftests/bpf/progs/missed_kprobe.c @@ -2,7 +2,7 @@ #include "vmlinux.h" #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/missed_kprobe_recursion.c b/tools/testing/selftests/bpf/progs/missed_kprobe_recursion.c index 8ea71cbd6c45..c4bf679a9876 100644 --- a/tools/testing/selftests/bpf/progs/missed_kprobe_recursion.c +++ b/tools/testing/selftests/bpf/progs/missed_kprobe_recursion.c @@ -2,7 +2,7 @@ #include "vmlinux.h" #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/nested_acquire.c b/tools/testing/selftests/bpf/progs/nested_acquire.c index 8e521a21d995..49ad7b9adf56 100644 --- a/tools/testing/selftests/bpf/progs/nested_acquire.c +++ b/tools/testing/selftests/bpf/progs/nested_acquire.c @@ -4,7 +4,7 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue.c b/tools/testing/selftests/bpf/progs/pro_epilogue.c index 44bc3f06b4b6..d97d6e07ef5c 100644 --- a/tools/testing/selftests/bpf/progs/pro_epilogue.c +++ b/tools/testing/selftests/bpf/progs/pro_epilogue.c @@ -4,8 +4,8 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c index 3529e53be355..6048d79be48b 100644 --- a/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c +++ b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c @@ -4,8 +4,8 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c index 8386bb15ccdc..84ad515eafd6 100644 --- a/tools/testing/selftests/bpf/progs/sock_addr_kern.c +++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Google LLC */ #include #include -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" SEC("syscall") int init_sock(struct init_sock_args *args) diff --git a/tools/testing/selftests/bpf/progs/struct_ops_detach.c b/tools/testing/selftests/bpf/progs/struct_ops_detach.c index d7fdcabe7d90..284a5b008e0c 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_detach.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_detach.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c index 3c822103bd40..d8cc99f5c2e2 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c index b450f72e744a..ccab3935aa42 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c index 6283099ec383..8b5515f4f724 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_maybe_null_fail.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 4c56d4a9d9f4..71c420c3a5a6 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c b/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c index 9efcc6e4d356..5b23ea817f1f 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c index fa2021388485..5d0937fa07be 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c @@ -2,7 +2,7 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c index 8ea57e5348ab..0e4d2ff63ab8 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c index 1f55ec4cee37..58d5d8dc2235 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c index f2f300d50988..31e58389bb8b 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c @@ -3,7 +3,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 7ac7e1de34d8..0ad1bf1ede8d 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -4,7 +4,7 @@ #include #include "bpf_misc.h" #include "bpf_kfuncs.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" SEC("tc") int kfunc_dynptr_nullable_test1(struct __sk_buff *skb) diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index cc1a012d038f..fb07f5773888 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -5,7 +5,7 @@ #include #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" __u32 raw_tp_read_sz = 0; diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 5aaf2b065f86..3bce838e92d5 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -3,7 +3,7 @@ #include "vmlinux.h" #include #include -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" #include "bpf_misc.h" SEC("tp_btf/bpf_testmod_test_nullable_bare") diff --git a/tools/testing/selftests/bpf/progs/unsupported_ops.c b/tools/testing/selftests/bpf/progs/unsupported_ops.c index 9180365a3568..8aa2e0dd624e 100644 --- a/tools/testing/selftests/bpf/progs/unsupported_ops.c +++ b/tools/testing/selftests/bpf/progs/unsupported_ops.c @@ -4,7 +4,7 @@ #include #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod.h" +#include "../test_kmods/bpf_testmod.h" char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index f8d3ae0c29ae..2f1ba08c293e 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -5,7 +5,7 @@ #include "bpf_experimental.h" #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 25b51a72fe0f..4240211a1900 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -5,7 +5,7 @@ #include "bpf_experimental.h" #include #include "bpf_misc.h" -#include "../bpf_testmod/bpf_testmod_kfunc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/.gitignore b/tools/testing/selftests/bpf/test_kmods/.gitignore new file mode 100644 index 000000000000..ded513777281 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/.gitignore @@ -0,0 +1,6 @@ +*.mod +*.mod.c +*.o +.ko +/Module.symvers +/modules.order diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile new file mode 100644 index 000000000000..d4e50c4509c9 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -0,0 +1,21 @@ +TEST_KMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= $(abspath $(TEST_KMOD_DIR)/../../../../..) + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +MODULES = bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ + bpf_test_modorder_y.ko + +$(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) + +CFLAGS_bpf_testmod.o = -I$(src) + +all: + $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) modules + +clean: + $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) clean diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_x.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_x.c new file mode 100644 index 000000000000..0cc747fa912f --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_x.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_test_modorder_retx(void) +{ + return 'x'; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_test_modorder_kfunc_x_ids) +BTF_ID_FLAGS(func, bpf_test_modorder_retx); +BTF_KFUNCS_END(bpf_test_modorder_kfunc_x_ids) + +static const struct btf_kfunc_id_set bpf_test_modorder_x_set = { + .owner = THIS_MODULE, + .set = &bpf_test_modorder_kfunc_x_ids, +}; + +static int __init bpf_test_modorder_x_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &bpf_test_modorder_x_set); +} + +static void __exit bpf_test_modorder_x_exit(void) +{ +} + +module_init(bpf_test_modorder_x_init); +module_exit(bpf_test_modorder_x_exit); + +MODULE_DESCRIPTION("BPF selftest ordertest module X"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_y.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_y.c new file mode 100644 index 000000000000..c627ee085d13 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_modorder_y.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_test_modorder_rety(void) +{ + return 'y'; +} + 
+__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_test_modorder_kfunc_y_ids) +BTF_ID_FLAGS(func, bpf_test_modorder_rety); +BTF_KFUNCS_END(bpf_test_modorder_kfunc_y_ids) + +static const struct btf_kfunc_id_set bpf_test_modorder_y_set = { + .owner = THIS_MODULE, + .set = &bpf_test_modorder_kfunc_y_ids, +}; + +static int __init bpf_test_modorder_y_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &bpf_test_modorder_y_set); +} + +static void __exit bpf_test_modorder_y_exit(void) +{ +} + +module_init(bpf_test_modorder_y_init); +module_exit(bpf_test_modorder_y_exit); + +MODULE_DESCRIPTION("BPF selftest ordertest module Y"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_no_cfi.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_no_cfi.c new file mode 100644 index 000000000000..948eb3962732 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_no_cfi.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include + +struct bpf_test_no_cfi_ops { + void (*fn_1)(void); + void (*fn_2)(void); +}; + +static int dummy_init(struct btf *btf) +{ + return 0; +} + +static int dummy_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int dummy_reg(void *kdata, struct bpf_link *link) +{ + return 0; +} + +static void dummy_unreg(void *kdata, struct bpf_link *link) +{ +} + +static const struct bpf_verifier_ops dummy_verifier_ops; + +static void bpf_test_no_cfi_ops__fn_1(void) +{ +} + +static void bpf_test_no_cfi_ops__fn_2(void) +{ +} + +static struct bpf_test_no_cfi_ops __test_no_cif_ops = { + .fn_1 = bpf_test_no_cfi_ops__fn_1, + .fn_2 = bpf_test_no_cfi_ops__fn_2, +}; + +static struct bpf_struct_ops test_no_cif_ops = { + .verifier_ops = &dummy_verifier_ops, + .init = dummy_init, + .init_member = dummy_init_member, + .reg = dummy_reg, + .unreg = dummy_unreg, + .name = "bpf_test_no_cfi_ops", + .owner = THIS_MODULE, +}; + +static int bpf_test_no_cfi_init(void) +{ + int ret; + + ret = register_bpf_struct_ops(&test_no_cif_ops, + bpf_test_no_cfi_ops); + if (!ret) + return -EINVAL; + + test_no_cif_ops.cfi_stubs = &__test_no_cif_ops; + ret = register_bpf_struct_ops(&test_no_cif_ops, + bpf_test_no_cfi_ops); + return ret; +} + +static void bpf_test_no_cfi_exit(void) +{ +} + +module_init(bpf_test_no_cfi_init); +module_exit(bpf_test_no_cfi_exit); + +MODULE_AUTHOR("Kuifeng Lee"); +MODULE_DESCRIPTION("BPF no cfi_stubs test module"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h new file mode 100644 index 000000000000..aeef86b3da74 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_testmod + +#if !defined(_BPF_TESTMOD_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _BPF_TESTMOD_EVENTS_H + +#include +#include "bpf_testmod.h" + +TRACE_EVENT(bpf_testmod_test_read, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_read_ctx *ctx), + TP_ARGS(task, ctx), + TP_STRUCT__entry( + __field(pid_t, pid) + __array(char, comm, TASK_COMM_LEN) + __field(loff_t, off) + __field(size_t, len) + ), + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + 
__entry->off = ctx->off; + __entry->len = ctx->len; + ), + TP_printk("pid=%d comm=%s off=%llu len=%zu", + __entry->pid, __entry->comm, __entry->off, __entry->len) +); + +/* A bare tracepoint with no event associated with it */ +DECLARE_TRACE(bpf_testmod_test_write_bare, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_write_ctx *ctx), + TP_ARGS(task, ctx) +); + +/* Used in bpf_testmod_test_read() to test __nullable suffix */ +DECLARE_TRACE(bpf_testmod_test_nullable_bare, + TP_PROTO(struct bpf_testmod_test_read_ctx *ctx__nullable), + TP_ARGS(ctx__nullable) +); + +struct sk_buff; + +DECLARE_TRACE(bpf_testmod_test_raw_tp_null, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb) +); + + +#undef BPF_TESTMOD_DECLARE_TRACE +#ifdef DECLARE_TRACE_WRITABLE +#define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) +#else +#define BPF_TESTMOD_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#endif + +BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, + TP_PROTO(struct bpf_testmod_test_writable_ctx *ctx), + TP_ARGS(ctx), + sizeof(struct bpf_testmod_test_writable_ctx) +); + +#endif /* _BPF_TESTMOD_EVENTS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_testmod-events +#include diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c new file mode 100644 index 000000000000..cc9dde507aba --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -0,0 +1,1487 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_testmod.h" +#include "bpf_testmod_kfunc.h" + +#define CREATE_TRACE_POINTS +#include "bpf_testmod-events.h" + +#define CONNECT_TIMEOUT_SEC 1 + +typedef int (*func_proto_typedef)(long); +typedef int (*func_proto_typedef_nested1)(func_proto_typedef); +typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); + +DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; +long bpf_testmod_test_struct_arg_result; +static DEFINE_MUTEX(sock_lock); +static struct socket *sock; + +struct bpf_testmod_struct_arg_1 { + int a; +}; +struct bpf_testmod_struct_arg_2 { + long a; + long b; +}; + +struct bpf_testmod_struct_arg_3 { + int a; + int b[]; +}; + +struct bpf_testmod_struct_arg_4 { + u64 a; + int b; +}; + +struct bpf_testmod_struct_arg_5 { + char a; + short b; + int c; + long d; +}; + +__bpf_hook_start(); + +noinline int +bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) { + bpf_testmod_test_struct_arg_result = a.a + a.b + b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_2(int a, struct bpf_testmod_struct_arg_2 b, int c) { + bpf_testmod_test_struct_arg_result = a + b.a + b.b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_3(int a, int b, struct bpf_testmod_struct_arg_2 c) { + bpf_testmod_test_struct_arg_result = a + b + c.a + c.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_4(struct bpf_testmod_struct_arg_1 a, int b, + int c, int d, struct bpf_testmod_struct_arg_2 e) { + bpf_testmod_test_struct_arg_result = a.a + b + c + d + e.a + e.b; + return 
bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_5(void) { + bpf_testmod_test_struct_arg_result = 1; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_6(struct bpf_testmod_struct_arg_3 *a) { + bpf_testmod_test_struct_arg_result = a->b[0]; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_7(u64 a, void *b, short c, int d, void *e, + struct bpf_testmod_struct_arg_4 f) +{ + bpf_testmod_test_struct_arg_result = a + (long)b + c + d + + (long)e + f.a + f.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_8(u64 a, void *b, short c, int d, void *e, + struct bpf_testmod_struct_arg_4 f, int g) +{ + bpf_testmod_test_struct_arg_result = a + (long)b + c + d + + (long)e + f.a + f.b + g; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char f, + short g, struct bpf_testmod_struct_arg_5 h, long i) +{ + bpf_testmod_test_struct_arg_result = a + (long)b + c + d + (long)e + + f + g + h.a + h.b + h.c + h.d + i; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) { + bpf_testmod_test_struct_arg_result = a->a; + return bpf_testmod_test_struct_arg_result; +} + +__bpf_kfunc void +bpf_testmod_test_mod_kfunc(int i) +{ + *(int *)this_cpu_ptr(&bpf_testmod_ksym_percpu) = i; +} + +__bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) +{ + it->cnt = cnt; + + if (cnt < 0) + return -EINVAL; + + it->value = value; + + return 0; +} + +__bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it) +{ + if (it->cnt <= 0) + return NULL; + + it->cnt--; + + return &it->value; +} + +__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) +{ + if (it__iter->cnt < 0) + return 0; + + return val + it__iter->value; +} + +__bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) +{ + it->cnt = 0; +} + +__bpf_kfunc void bpf_kfunc_common_test(void) +{ +} + +__bpf_kfunc void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, + struct bpf_dynptr *ptr__nullable) +{ +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) +{ + return NULL; +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) +{ + return NULL; +} + +__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc struct bpf_testmod_ctx * +bpf_testmod_ctx_create(int *err) +{ + struct bpf_testmod_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); + if (!ctx) { + *err = -ENOMEM; + return NULL; + } + refcount_set(&ctx->usage, 1); + + return ctx; +} + +static void testmod_free_cb(struct rcu_head *head) +{ + struct bpf_testmod_ctx *ctx; + + ctx = container_of(head, struct bpf_testmod_ctx, rcu); + kfree(ctx); +} + +__bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) +{ + if (!ctx) + return; + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, testmod_free_cb); +} + +static struct bpf_testmod_ops3 *st_ops3; + 
+static int bpf_testmod_test_3(void) +{ + return 0; +} + +static int bpf_testmod_test_4(void) +{ + return 0; +} + +static struct bpf_testmod_ops3 __bpf_testmod_ops3 = { + .test_1 = bpf_testmod_test_3, + .test_2 = bpf_testmod_test_4, +}; + +static void bpf_testmod_test_struct_ops3(void) +{ + if (st_ops3) + st_ops3->test_1(); +} + +__bpf_kfunc void bpf_testmod_ops3_call_test_1(void) +{ + st_ops3->test_1(); +} + +__bpf_kfunc void bpf_testmod_ops3_call_test_2(void) +{ + st_ops3->test_2(); +} + +struct bpf_testmod_btf_type_tag_1 { + int a; +}; + +struct bpf_testmod_btf_type_tag_2 { + struct bpf_testmod_btf_type_tag_1 __user *p; +}; + +struct bpf_testmod_btf_type_tag_3 { + struct bpf_testmod_btf_type_tag_1 __percpu *p; +}; + +noinline int +bpf_testmod_test_btf_type_tag_user_1(struct bpf_testmod_btf_type_tag_1 __user *arg) { + BTF_TYPE_EMIT(func_proto_typedef); + BTF_TYPE_EMIT(func_proto_typedef_nested1); + BTF_TYPE_EMIT(func_proto_typedef_nested2); + return arg->a; +} + +noinline int +bpf_testmod_test_btf_type_tag_user_2(struct bpf_testmod_btf_type_tag_2 *arg) { + return arg->p->a; +} + +noinline int +bpf_testmod_test_btf_type_tag_percpu_1(struct bpf_testmod_btf_type_tag_1 __percpu *arg) { + return arg->a; +} + +noinline int +bpf_testmod_test_btf_type_tag_percpu_2(struct bpf_testmod_btf_type_tag_3 *arg) { + return arg->p->a; +} + +noinline int bpf_testmod_loop_test(int n) +{ + /* Make sum volatile, so smart compilers, such as clang, will not + * optimize the code by removing the loop. + */ + volatile int sum = 0; + int i; + + /* the primary goal of this test is to test LBR. Create a lot of + * branches in the function, so we can catch it easily. + */ + for (i = 0; i < n; i++) + sum += i; + return sum; +} + +__weak noinline struct file *bpf_testmod_return_ptr(int arg) +{ + static struct file f = {}; + + switch (arg) { + case 1: return (void *)EINVAL; /* user addr */ + case 2: return (void *)0xcafe4a11; /* user addr */ + case 3: return (void *)-EINVAL; /* canonical, but invalid */ + case 4: return (void *)(1ull << 60); /* non-canonical and invalid */ + case 5: return (void *)~(1ull << 30); /* trigger extable */ + case 6: return &f; /* valid addr */ + case 7: return (void *)((long)&f | 1); /* kernel tricks */ +#ifdef CONFIG_X86_64 + case 8: return (void *)VSYSCALL_ADDR; /* vsyscall page address */ +#endif + default: return NULL; + } +} + +noinline int bpf_testmod_fentry_test1(int a) +{ + return a + 1; +} + +noinline int bpf_testmod_fentry_test2(int a, u64 b) +{ + return a + b; +} + +noinline int bpf_testmod_fentry_test3(char a, int b, u64 c) +{ + return a + b + c; +} + +noinline int bpf_testmod_fentry_test7(u64 a, void *b, short c, int d, + void *e, char f, int g) +{ + return a + (long)b + c + d + (long)e + f + g; +} + +noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, + void *e, char f, int g, + unsigned int h, long i, __u64 j, + unsigned long k) +{ + return a + (long)b + c + d + (long)e + f + g + h + i + j + k; +} + +int bpf_testmod_fentry_ok; + +noinline ssize_t +bpf_testmod_test_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_read_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + struct bpf_testmod_struct_arg_1 struct_arg1 = {10}, struct_arg1_2 = {-1}; + struct bpf_testmod_struct_arg_2 struct_arg2 = {2, 3}; + struct bpf_testmod_struct_arg_3 *struct_arg3; + struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22}; + struct bpf_testmod_struct_arg_5 struct_arg5 = {23, 24, 
25, 26}; + int i = 1; + + while (bpf_testmod_return_ptr(i)) + i++; + + (void)bpf_testmod_test_struct_arg_1(struct_arg2, 1, 4); + (void)bpf_testmod_test_struct_arg_2(1, struct_arg2, 4); + (void)bpf_testmod_test_struct_arg_3(1, 4, struct_arg2); + (void)bpf_testmod_test_struct_arg_4(struct_arg1, 1, 2, 3, struct_arg2); + (void)bpf_testmod_test_struct_arg_5(); + (void)bpf_testmod_test_struct_arg_7(16, (void *)17, 18, 19, + (void *)20, struct_arg4); + (void)bpf_testmod_test_struct_arg_8(16, (void *)17, 18, 19, + (void *)20, struct_arg4, 23); + (void)bpf_testmod_test_struct_arg_9(16, (void *)17, 18, 19, (void *)20, + 21, 22, struct_arg5, 27); + + (void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2); + + (void)trace_bpf_testmod_test_raw_tp_null(NULL); + + bpf_testmod_test_struct_ops3(); + + struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) + + sizeof(int)), GFP_KERNEL); + if (struct_arg3 != NULL) { + struct_arg3->b[0] = 1; + (void)bpf_testmod_test_struct_arg_6(struct_arg3); + kfree(struct_arg3); + } + + /* This is always true. Use the check to make sure the compiler + * doesn't remove bpf_testmod_loop_test. + */ + if (bpf_testmod_loop_test(101) > 100) + trace_bpf_testmod_test_read(current, &ctx); + + trace_bpf_testmod_test_nullable_bare(NULL); + + /* Magic number to enable writable tp */ + if (len == 64) { + struct bpf_testmod_test_writable_ctx writable = { + .val = 1024, + }; + trace_bpf_testmod_test_writable_bare(&writable); + if (writable.early_ret) + return snprintf(buf, len, "%d\n", writable.val); + } + + if (bpf_testmod_fentry_test1(1) != 2 || + bpf_testmod_fentry_test2(2, 3) != 5 || + bpf_testmod_fentry_test3(4, 5, 6) != 15 || + bpf_testmod_fentry_test7(16, (void *)17, 18, 19, (void *)20, + 21, 22) != 133 || + bpf_testmod_fentry_test11(16, (void *)17, 18, 19, (void *)20, + 21, 22, 23, 24, 25, 26) != 231) + goto out; + + bpf_testmod_fentry_ok = 1; +out: + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_read); +ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); + +noinline ssize_t +bpf_testmod_test_write(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_write_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + trace_bpf_testmod_test_write_bare(current, &ctx); + + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_write); +ALLOW_ERROR_INJECTION(bpf_testmod_test_write, ERRNO); + +noinline int bpf_fentry_shadow_test(int a) +{ + return a + 2; +} +EXPORT_SYMBOL_GPL(bpf_fentry_shadow_test); + +__bpf_hook_end(); + +static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { + .attr = { .name = "bpf_testmod", .mode = 0666, }, + .read = bpf_testmod_test_read, + .write = bpf_testmod_test_write, +}; + +/* bpf_testmod_uprobe sysfs attribute is so far enabled for x86_64 only, + * please see test_uretprobe_regs_change test + */ +#ifdef __x86_64__ + +static int +uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, + struct pt_regs *regs, __u64 *data) + +{ + regs->ax = 0x12345678deadbeef; + regs->cx = 0x87654321feebdaed; + regs->r11 = (u64) -1; + return true; +} + +struct testmod_uprobe { + struct path path; + struct uprobe *uprobe; + struct uprobe_consumer consumer; +}; + +static DEFINE_MUTEX(testmod_uprobe_mutex); + +static struct testmod_uprobe uprobe = { + .consumer.ret_handler = uprobe_ret_handler, +}; + +static int testmod_register_uprobe(loff_t offset) +{ + int err = -EBUSY; + + if (uprobe.uprobe) + return -EBUSY; + + 
mutex_lock(&testmod_uprobe_mutex); + + if (uprobe.uprobe) + goto out; + + err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); + if (err) + goto out; + + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); + path_put(&uprobe.path); + uprobe.uprobe = NULL; + } +out: + mutex_unlock(&testmod_uprobe_mutex); + return err; +} + +static void testmod_unregister_uprobe(void) +{ + mutex_lock(&testmod_uprobe_mutex); + + if (uprobe.uprobe) { + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); + path_put(&uprobe.path); + uprobe.uprobe = NULL; + } + + mutex_unlock(&testmod_uprobe_mutex); +} + +static ssize_t +bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + unsigned long offset = 0; + int err = 0; + + if (kstrtoul(buf, 0, &offset)) + return -EINVAL; + + if (offset) + err = testmod_register_uprobe(offset); + else + testmod_unregister_uprobe(); + + return err ?: strlen(buf); +} + +static struct bin_attribute bin_attr_bpf_testmod_uprobe_file __ro_after_init = { + .attr = { .name = "bpf_testmod_uprobe", .mode = 0666, }, + .write = bpf_testmod_uprobe_write, +}; + +static int register_bpf_testmod_uprobe(void) +{ + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file); +} + +static void unregister_bpf_testmod_uprobe(void) +{ + testmod_unregister_uprobe(); + sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file); +} + +#else +static int register_bpf_testmod_uprobe(void) +{ + return 0; +} + +static void unregister_bpf_testmod_uprobe(void) { } +#endif + +BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) +BTF_ID_FLAGS(func, bpf_kfunc_common_test) +BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) +BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) +BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) +BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) + +BTF_ID_LIST(bpf_testmod_dtor_ids) +BTF_ID(struct, bpf_testmod_ctx) +BTF_ID(func, bpf_testmod_ctx_release) + +static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_testmod_common_kfunc_ids, +}; + +__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +{ + return a + b + c + d; +} + +__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) +{ + return a + b; +} + +__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk) +{ + return sk; +} + +__bpf_kfunc long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d) +{ + /* Provoke the compiler to assume that the caller has 
sign-extended a, + * b and c on platforms where this is required (e.g. s390x). + */ + return (long)a + (long)b + (long)c + d; +} + +static struct prog_test_ref_kfunc prog_test_struct = { + .a = 42, + .b = 108, + .next = &prog_test_struct, + .cnt = REFCOUNT_INIT(1), +}; + +__bpf_kfunc struct prog_test_ref_kfunc * +bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) +{ + refcount_inc(&prog_test_struct.cnt); + return &prog_test_struct; +} + +__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p) +{ + WARN_ON_ONCE(1); +} + +__bpf_kfunc struct prog_test_member * +bpf_kfunc_call_memb_acquire(void) +{ + WARN_ON_ONCE(1); + return NULL; +} + +__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) +{ + WARN_ON_ONCE(1); +} + +static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size) +{ + if (size > 2 * sizeof(int)) + return NULL; + + return (int *)p; +} + +__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, + const int rdwr_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size); +} + +__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, + const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +/* the next 2 ones can't be really used for testing expect to ensure + * that the verifier rejects the call. + * Acquire functions must return struct pointers, so these ones are + * failing. + */ +__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, + const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) +{ + /* p != NULL, but p->cnt could be 0 */ +} + +__bpf_kfunc void bpf_kfunc_call_test_destructive(void) +{ +} + +__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) +{ + return arg; +} + +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + +__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) +{ + int proto; + int err; + + mutex_lock(&sock_lock); + + if (sock) { + pr_err("%s called without releasing old sock", __func__); + err = -EPERM; + goto out; + } + + switch (args->af) { + case AF_INET: + case AF_INET6: + proto = args->type == SOCK_STREAM ? 
IPPROTO_TCP : IPPROTO_UDP; + break; + case AF_UNIX: + proto = PF_UNIX; + break; + default: + pr_err("invalid address family %d\n", args->af); + err = -EINVAL; + goto out; + } + + err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, + proto, &sock); + + if (!err) + /* Set timeout for call to kernel_connect() to prevent it from hanging, + * and consider the connection attempt failed if it returns + * -EINPROGRESS. + */ + sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc void bpf_kfunc_close_sock(void) +{ + mutex_lock(&sock_lock); + + if (sock) { + sock_release(sock); + sock = NULL; + } + + mutex_unlock(&sock_lock); +} + +__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_connect(sock, (struct sockaddr *)&args->addr, + args->addrlen, 0); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_listen(sock, 128); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = sock_sendmsg(sock, &msg); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = 
err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +static DEFINE_MUTEX(st_ops_mutex); +static struct bpf_testmod_st_ops *st_ops; + +__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_prologue) + ret = st_ops->test_prologue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_epilogue) + ret = st_ops->test_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_pro_epilogue) + ret = st_ops->test_pro_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) +{ + args->a += 10; + return args->a; +} + +BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) +BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) +BTF_ID_FLAGS(func, bpf_kfunc_call_test1) +BTF_ID_FLAGS(func, bpf_kfunc_call_test2) +BTF_ID_FLAGS(func, bpf_kfunc_call_test3) +BTF_ID_FLAGS(func, bpf_kfunc_call_test4) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) +BTF_ID_FLAGS(func, 
bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) + +static int bpf_testmod_ops_init(struct btf *btf) +{ + return 0; +} + +static bool bpf_testmod_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_testmod_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + if (member->offset == offsetof(struct bpf_testmod_ops, data) * 8) { + /* For data fields, this function has to copy it and return + * 1 to indicate that the data has been handled by the + * struct_ops type, or the verifier will reject the map if + * the value of the data field is not zero. + */ + ((struct bpf_testmod_ops *)kdata)->data = ((struct bpf_testmod_ops *)udata)->data; + return 1; + } + return 0; +} + +static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_testmod_check_kfunc_ids, +}; + +static const struct bpf_verifier_ops bpf_testmod_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, +}; + +static const struct bpf_verifier_ops bpf_testmod_verifier_ops3 = { + .is_valid_access = bpf_testmod_ops_is_valid_access, +}; + +static int bpf_dummy_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_ops *ops = kdata; + + if (ops->test_1) + ops->test_1(); + /* Some test cases (ex. struct_ops_maybe_null) may not have test_2 + * initialized, so we need to check for NULL. 
+ */ + if (ops->test_2) + ops->test_2(4, ops->data); + + return 0; +} + +static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) +{ +} + +static int bpf_testmod_test_1(void) +{ + return 0; +} + +static void bpf_testmod_test_2(int a, int b) +{ +} + +static int bpf_testmod_tramp(int value) +{ + return 0; +} + +static int bpf_testmod_ops__test_maybe_null(int dummy, + struct task_struct *task__nullable) +{ + return 0; +} + +static struct bpf_testmod_ops __bpf_testmod_ops = { + .test_1 = bpf_testmod_test_1, + .test_2 = bpf_testmod_test_2, + .test_maybe_null = bpf_testmod_ops__test_maybe_null, +}; + +struct bpf_struct_ops bpf_bpf_testmod_ops = { + .verifier_ops = &bpf_testmod_verifier_ops, + .init = bpf_testmod_ops_init, + .init_member = bpf_testmod_ops_init_member, + .reg = bpf_dummy_reg, + .unreg = bpf_dummy_unreg, + .cfi_stubs = &__bpf_testmod_ops, + .name = "bpf_testmod_ops", + .owner = THIS_MODULE, +}; + +static int bpf_dummy_reg2(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_ops2 *ops = kdata; + + ops->test_1(); + return 0; +} + +static struct bpf_testmod_ops2 __bpf_testmod_ops2 = { + .test_1 = bpf_testmod_test_1, +}; + +struct bpf_struct_ops bpf_testmod_ops2 = { + .verifier_ops = &bpf_testmod_verifier_ops, + .init = bpf_testmod_ops_init, + .init_member = bpf_testmod_ops_init_member, + .reg = bpf_dummy_reg2, + .unreg = bpf_dummy_unreg, + .cfi_stubs = &__bpf_testmod_ops2, + .name = "bpf_testmod_ops2", + .owner = THIS_MODULE, +}; + +static int st_ops3_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops3) { + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops3 = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops3_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops3 = NULL; + mutex_unlock(&st_ops_mutex); +} + +static void test_1_recursion_detected(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + printk("bpf_testmod: oh no, recursing into test_1, recursion_misses %llu", + u64_stats_read(&stats->misses)); +} + +static int st_ops3_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_testmod_ops3, test_1): + prog->aux->priv_stack_requested = true; + prog->aux->recursion_detected = test_1_recursion_detected; + fallthrough; + default: + break; + } + return 0; +} + +struct bpf_struct_ops bpf_testmod_ops3 = { + .verifier_ops = &bpf_testmod_verifier_ops3, + .init = bpf_testmod_ops_init, + .init_member = bpf_testmod_ops_init_member, + .reg = st_ops3_reg, + .unreg = st_ops3_unreg, + .check_member = st_ops3_check_member, + .cfi_stubs = &__bpf_testmod_ops3, + .name = "bpf_testmod_ops3", + .owner = THIS_MODULE, +}; + +static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_prologue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r6 = r1[0]; // r6 will be "struct st_ops *args". 
r1 is "u64 *ctx". + * r7 = r6->a; + * r7 += 1000; + * r6->a = r7; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_epilogue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct st_ops *args" + * r6 = r1->a; + * r6 += 10000; + * r1->a = r6; + * r0 = r6; + * r0 *= 2; + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); + *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int st_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + if (off < 0 || off + size > sizeof(struct st_ops_args)) + return -EACCES; + return 0; +} + +static const struct bpf_verifier_ops st_ops_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, + .btf_struct_access = st_ops_btf_struct_access, + .gen_prologue = st_ops_gen_prologue, + .gen_epilogue = st_ops_gen_epilogue, + .get_func_proto = bpf_base_func_proto, +}; + +static struct bpf_testmod_st_ops st_ops_cfi_stubs = { + .test_prologue = bpf_test_mod_st_ops__test_prologue, + .test_epilogue = bpf_test_mod_st_ops__test_epilogue, + .test_pro_epilogue = bpf_test_mod_st_ops__test_pro_epilogue, +}; + +static int st_ops_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops) { + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops = NULL; + mutex_unlock(&st_ops_mutex); +} + +static int st_ops_init(struct btf *btf) +{ + return 0; +} + +static int st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_struct_ops testmod_st_ops = { + .verifier_ops = &st_ops_verifier_ops, + .init = st_ops_init, + .init_member = st_ops_init_member, + .reg = st_ops_reg, + .unreg = st_ops_unreg, + .cfi_stubs = &st_ops_cfi_stubs, + .name = "bpf_testmod_st_ops", + .owner = THIS_MODULE, +}; + +extern int bpf_fentry_test1(int a); + +static int bpf_testmod_init(void) +{ + const struct btf_id_dtor_kfunc bpf_testmod_dtors[] = { + { + .btf_id = bpf_testmod_dtor_ids[0], + .kfunc_btf_id = bpf_testmod_dtor_ids[1] + }, + }; + void **tramp; + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); + ret = ret 
?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); + ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); + ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); + ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3); + ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); + ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, + ARRAY_SIZE(bpf_testmod_dtors), + THIS_MODULE); + if (ret < 0) + return ret; + if (bpf_fentry_test1(0) < 0) + return -EINVAL; + sock = NULL; + mutex_init(&sock_lock); + ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); + if (ret < 0) + return ret; + ret = register_bpf_testmod_uprobe(); + if (ret < 0) + return ret; + + /* Ensure nothing is between tramp_1..tramp_40 */ + BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != + offsetofend(struct bpf_testmod_ops, tramp_40)); + tramp = (void **)&__bpf_testmod_ops.tramp_1; + while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) + *tramp++ = bpf_testmod_tramp; + + return 0; +} + +static void bpf_testmod_exit(void) +{ + /* Need to wait for all references to be dropped because + * bpf_kfunc_call_test_release() which currently resides in kernel can + * be called after bpf_testmod is unloaded. Once release function is + * moved into the module this wait can be removed. + */ + while (refcount_read(&prog_test_struct.cnt) > 1) + msleep(20); + + bpf_kfunc_close_sock(); + sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); + unregister_bpf_testmod_uprobe(); +} + +module_init(bpf_testmod_init); +module_exit(bpf_testmod_exit); + +MODULE_AUTHOR("Andrii Nakryiko"); +MODULE_DESCRIPTION("BPF selftests module"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h new file mode 100644 index 000000000000..356803d1c10e --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#ifndef _BPF_TESTMOD_H +#define _BPF_TESTMOD_H + +#include + +struct task_struct; + +struct bpf_testmod_test_read_ctx { + char *buf; + loff_t off; + size_t len; +}; + +struct bpf_testmod_test_write_ctx { + char *buf; + loff_t off; + size_t len; +}; + +struct bpf_testmod_test_writable_ctx { + bool early_ret; + int val; +}; + +/* BPF iter that returns *value* *n* times in a row */ +struct bpf_iter_testmod_seq { + s64 value; + int cnt; +}; + +struct bpf_testmod_ops { + int (*test_1)(void); + void (*test_2)(int a, int b); + /* Used to test nullable arguments. */ + int (*test_maybe_null)(int dummy, struct task_struct *task); + int (*unsupported_ops)(void); + + /* The following fields are used to test shadow copies. */ + char onebyte; + struct { + int a; + int b; + } unsupported; + int data; + + /* The following pointers are used to test the maps having multiple + * pages of trampolines. 
+ */ + int (*tramp_1)(int value); + int (*tramp_2)(int value); + int (*tramp_3)(int value); + int (*tramp_4)(int value); + int (*tramp_5)(int value); + int (*tramp_6)(int value); + int (*tramp_7)(int value); + int (*tramp_8)(int value); + int (*tramp_9)(int value); + int (*tramp_10)(int value); + int (*tramp_11)(int value); + int (*tramp_12)(int value); + int (*tramp_13)(int value); + int (*tramp_14)(int value); + int (*tramp_15)(int value); + int (*tramp_16)(int value); + int (*tramp_17)(int value); + int (*tramp_18)(int value); + int (*tramp_19)(int value); + int (*tramp_20)(int value); + int (*tramp_21)(int value); + int (*tramp_22)(int value); + int (*tramp_23)(int value); + int (*tramp_24)(int value); + int (*tramp_25)(int value); + int (*tramp_26)(int value); + int (*tramp_27)(int value); + int (*tramp_28)(int value); + int (*tramp_29)(int value); + int (*tramp_30)(int value); + int (*tramp_31)(int value); + int (*tramp_32)(int value); + int (*tramp_33)(int value); + int (*tramp_34)(int value); + int (*tramp_35)(int value); + int (*tramp_36)(int value); + int (*tramp_37)(int value); + int (*tramp_38)(int value); + int (*tramp_39)(int value); + int (*tramp_40)(int value); +}; + +struct bpf_testmod_ops2 { + int (*test_1)(void); +}; + +struct bpf_testmod_ops3 { + int (*test_1)(void); + int (*test_2)(void); +}; + +struct st_ops_args { + u64 a; +}; + +struct bpf_testmod_st_ops { + int (*test_prologue)(struct st_ops_args *args); + int (*test_epilogue)(struct st_ops_args *args); + int (*test_pro_epilogue)(struct st_ops_args *args); + struct module *owner; +}; + +#endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h new file mode 100644 index 000000000000..b58817938deb --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BPF_TESTMOD_KFUNC_H +#define _BPF_TESTMOD_KFUNC_H + +#ifndef __KERNEL__ +#include +#include +#else +#define __ksym +struct prog_test_member1 { + int a; +}; + +struct prog_test_member { + struct prog_test_member1 m; + int c; +}; + +struct prog_test_ref_kfunc { + int a; + int b; + struct prog_test_member memb; + struct prog_test_ref_kfunc *next; + refcount_t cnt; +}; +#endif + +struct prog_test_pass1 { + int x0; + struct { + int x1; + struct { + int x2; + struct { + int x3; + }; + }; + }; +}; + +struct prog_test_pass2 { + int len; + short arr1[4]; + struct { + char arr2[4]; + unsigned long arr3[8]; + } x; +}; + +struct prog_test_fail1 { + void *p; + int x; +}; + +struct prog_test_fail2 { + int x8; + struct prog_test_pass1 x; +}; + +struct prog_test_fail3 { + int len; + char arr1[2]; + char arr2[]; +}; + +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct __kernel_sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + +struct bpf_testmod_ctx { + struct callback_head rcu; + refcount_t usage; +}; + +struct prog_test_ref_kfunc * +bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; +void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; +void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) __ksym; + +void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym; +int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym; +int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc 
*p, const int rdonly_buf_size) __ksym; +int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym; +void bpf_kfunc_call_int_mem_release(int *p) __ksym; + +/* The bpf_kfunc_call_test_static_unused_arg is defined as static, + * but bpf program compilation needs to see it as global symbol. + */ +#ifndef __KERNEL__ +u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) __ksym; +#endif + +void bpf_testmod_test_mod_kfunc(int i) __ksym; + +__u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; +int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; + +void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym; +void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; +void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; +void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; + +void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; + +void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); +struct prog_test_member *bpf_kfunc_call_memb_acquire(void); +void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p); +void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p); +void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p); +void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); +void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); + +void bpf_kfunc_common_test(void) __ksym; + +int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; +void bpf_kfunc_close_sock(void) __ksym; +int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_listen(void) __ksym; +int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; + +void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, struct bpf_dynptr *ptr__nullable) __ksym; + +struct bpf_testmod_ctx *bpf_testmod_ctx_create(int *err) __ksym; +void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) __ksym; + +struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; +void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; + +struct st_ops_args; +int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; + +void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; +void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; +void bpf_kfunc_trusted_num_test(int *ptr) __ksym; +void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; + +#endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 9d93db0d1881c9e37e1528cd796e20ff13b7692c Mon Sep 17 00:00:00 2001 From: Gautam Somani Date: Sun, 1 Dec 2024 03:41:02 +0900 Subject: x86/mm/selftests: Fix typo in lam.c Change the spelling from metadate -> metadata Signed-off-by: Gautam Somani Signed-off-by: 
Ingo Molnar Link: https://lore.kernel.org/r/20241130184102.2182-1-gautamsomani@gmail.com --- tools/testing/selftests/x86/lam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 0ea4f6813930..4d4a76532dc9 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -237,7 +237,7 @@ static uint64_t set_metadata(uint64_t src, unsigned long lam) * both pointers should point to the same address. * * @return: - * 0: value on the pointer with metadate and value on original are same + * 0: value on the pointer with metadata and value on original are same * 1: not same. */ static int handle_lam_test(void *src, unsigned int lam) -- cgit v1.2.3 From 48697bdfb65d21bab8c686830b04bf2e47b96d52 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 4 Dec 2024 16:32:39 +0000 Subject: selftests: net: cleanup busy_poller.c Fix various integer type conversions by using strtoull and a temporary variable which is bounds checked before being casted into the appropriate cfg_* variable for use by the test program. While here: - free the strdup'd cfg string for overall hygenie. - initialize napi_id = 0 in setup_queue to avoid warnings on some compilers. Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204163239.294123-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/busy_poller.c | 88 ++++++++++++++++++------------- 1 file changed, 50 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c index 99b0e8c17fca..04c7ff577bb8 100644 --- a/tools/testing/selftests/net/busy_poller.c +++ b/tools/testing/selftests/net/busy_poller.c @@ -54,16 +54,16 @@ struct epoll_params { #define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) #endif -static uint32_t cfg_port = 8000; +static uint16_t cfg_port = 8000; static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY }; static char *cfg_outfile; static int cfg_max_events = 8; -static int cfg_ifindex; +static uint32_t cfg_ifindex; /* busy poll params */ static uint32_t cfg_busy_poll_usecs; -static uint32_t cfg_busy_poll_budget; -static uint32_t cfg_prefer_busy_poll; +static uint16_t cfg_busy_poll_budget; +static uint8_t cfg_prefer_busy_poll; /* IRQ params */ static uint32_t cfg_defer_hard_irqs; @@ -79,6 +79,7 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { + unsigned long long tmp; int ret; int c; @@ -86,31 +87,40 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) { + /* most options take integer values, except o and b, so reduce + * code duplication a bit for the common case by calling + * strtoull here and leave bounds checking and casting per + * option below. 
+ */ + if (c != 'o' && c != 'b') + tmp = strtoull(optarg, NULL, 0); + switch (c) { case 'u': - cfg_busy_poll_usecs = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_usecs == ULONG_MAX || - cfg_busy_poll_usecs > UINT32_MAX) + if (tmp == ULLONG_MAX || tmp > UINT32_MAX) error(1, ERANGE, "busy_poll_usecs too large"); + + cfg_busy_poll_usecs = (uint32_t)tmp; break; case 'P': - cfg_prefer_busy_poll = strtoul(optarg, NULL, 0); - if (cfg_prefer_busy_poll == ULONG_MAX || - cfg_prefer_busy_poll > 1) + if (tmp == ULLONG_MAX || tmp > 1) error(1, ERANGE, "prefer busy poll should be 0 or 1"); + + cfg_prefer_busy_poll = (uint8_t)tmp; break; case 'g': - cfg_busy_poll_budget = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_budget == ULONG_MAX || - cfg_busy_poll_budget > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "busy poll budget must be [0, UINT16_MAX]"); + + cfg_busy_poll_budget = (uint16_t)tmp; break; case 'p': - cfg_port = strtoul(optarg, NULL, 0); - if (cfg_port > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "port must be <= 65535"); + + cfg_port = (uint16_t)tmp; break; case 'b': ret = inet_aton(optarg, &cfg_bind_addr); @@ -124,41 +134,39 @@ static void parse_opts(int argc, char **argv) error(1, 0, "outfile invalid"); break; case 'm': - cfg_max_events = strtol(optarg, NULL, 0); - - if (cfg_max_events == LONG_MIN || - cfg_max_events == LONG_MAX || - cfg_max_events <= 0) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "max events must be > 0 and < LONG_MAX"); + "max events must be > 0 and <= INT_MAX"); + + cfg_max_events = (int)tmp; break; case 'd': - cfg_defer_hard_irqs = strtoul(optarg, NULL, 0); - - if (cfg_defer_hard_irqs == ULONG_MAX || - cfg_defer_hard_irqs > INT32_MAX) + if (tmp == ULLONG_MAX || tmp > INT32_MAX) error(1, ERANGE, "defer_hard_irqs must be <= INT32_MAX"); + + cfg_defer_hard_irqs = (uint32_t)tmp; break; case 'r': - cfg_gro_flush_timeout = strtoull(optarg, NULL, 0); - - if (cfg_gro_flush_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, - "gro_flush_timeout must be < ULLONG_MAX"); + "gro_flush_timeout must be < UINT64_MAX"); + + cfg_gro_flush_timeout = (uint64_t)tmp; break; case 's': - cfg_irq_suspend_timeout = strtoull(optarg, NULL, 0); - - if (cfg_irq_suspend_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, "irq_suspend_timeout must be < ULLONG_MAX"); + + cfg_irq_suspend_timeout = (uint64_t)tmp; break; case 'i': - cfg_ifindex = strtoul(optarg, NULL, 0); - if (cfg_ifindex == ULONG_MAX) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "ifindex must be < ULONG_MAX"); + "ifindex must be <= INT_MAX"); + + cfg_ifindex = (int)tmp; break; } } @@ -215,7 +223,7 @@ static void setup_queue(void) struct netdev_napi_set_req *set_req = NULL; struct ynl_sock *ys; struct ynl_error yerr; - uint32_t napi_id; + uint32_t napi_id = 0; ys = ynl_sock_create(&ynl_netdev_family, &yerr); if (!ys) @@ -277,8 +285,8 @@ static void run_poller(void) * here */ epoll_params.busy_poll_usecs = cfg_busy_poll_usecs; - epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget; - epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll; + epoll_params.busy_poll_budget = cfg_busy_poll_budget; + epoll_params.prefer_busy_poll = cfg_prefer_busy_poll; epoll_params.__pad = 0; val = 1; @@ -342,5 +350,9 @@ int main(int argc, char *argv[]) parse_opts(argc, argv); setup_queue(); run_poller(); + + if (cfg_outfile) + free(cfg_outfile); + return 0; } -- cgit v1.2.3 From 
00ab246750821b226f14ebc94ad21431dc82820b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Dec 2024 11:30:56 +0100 Subject: tools: ynl-gen-c: annotate valid choices for --mode This makes argparse validate the input and helps users understand which modes are possible. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241206113100.e2ab5cf6937c.Ie149a0ca5df713860964b44fe9d9ae547f2e1553@changeid Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 8098bcbb6f40..7f6e5157770d 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2706,7 +2706,8 @@ def find_kernel_root(full_path): def main(): parser = argparse.ArgumentParser(description='Netlink simple parsing generator') - parser.add_argument('--mode', dest='mode', type=str, required=True) + parser.add_argument('--mode', dest='mode', type=str, required=True, + choices=('user', 'kernel', 'uapi')) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--header', dest='header', action='store_true', default=None) parser.add_argument('--source', dest='header', action='store_false') -- cgit v1.2.3 From 81d89e6e88d5d592c1792940753d69d9753b3a8a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Dec 2024 11:30:57 +0100 Subject: tools: ynl-gen-c: don't require -o argument Without -o the tool currently crashes, but it's not marked as required. The only thing we can't do without it is to generate the correct #include for user source files, but we can put a placeholder instead. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241206113100.89d35bf124d6.I9228fb704e6d5c9d8e046ef15025a47a48439c1e@changeid Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 7f6e5157770d..ec2288948795 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2761,7 +2761,10 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() - hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.out_file: + hdr_file = os.path.basename(args.out_file[:-2]) + ".h" + else: + hdr_file = "generated_header_file.h" if args.mode == 'kernel': cw.p('#include ') -- cgit v1.2.3 From 477fb0671feb7b51fed9a803aeafa64d66cf6101 Mon Sep 17 00:00:00 2001 From: George Guo Date: Mon, 25 Nov 2024 19:28:12 +0800 Subject: selftests/livepatch: Replace hardcoded module name with variable in test-callbacks.sh Replaced the hardcoded module name test_klp_callbacks_demo in the pre_patch_callback log message with the variable $MOD_LIVEPATCH. 
Signed-off-by: George Guo Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20241125112812.281018-2-dongtai.guo@linux.dev Signed-off-by: Petr Mladek --- tools/testing/selftests/livepatch/test-callbacks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/livepatch/test-callbacks.sh b/tools/testing/selftests/livepatch/test-callbacks.sh index 37bbc3fb2780..2a03deb26a12 100755 --- a/tools/testing/selftests/livepatch/test-callbacks.sh +++ b/tools/testing/selftests/livepatch/test-callbacks.sh @@ -259,7 +259,7 @@ $MOD_TARGET: ${MOD_TARGET}_init % insmod test_modules/$MOD_LIVEPATCH.ko pre_patch_ret=-19 livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition -test_klp_callbacks_demo: pre_patch_callback: vmlinux +$MOD_LIVEPATCH: pre_patch_callback: vmlinux livepatch: pre-patch callback failed for object 'vmlinux' livepatch: failed to enable patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch -- cgit v1.2.3 From ed2ec63d3faa4a3fda0dbf7cca93a9d0d9793100 Mon Sep 17 00:00:00 2001 From: Wardenjohn Date: Thu, 24 Oct 2024 16:35:30 +0800 Subject: selftests: livepatch: add test cases of stack_order sysfs interface Add selftest test cases to sysfs attribute 'stack_order'. Suggested-by: Petr Mladek Signed-off-by: Wardenjohn Reviewed-by: Petr Mladek Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20241024083530.58775-1-zhangwarden@gmail.com [pmladek@suse.com: Substitute $SYSFS_KLP_DIR] Signed-off-by: Petr Mladek --- tools/testing/selftests/livepatch/test-sysfs.sh | 71 +++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh index 2c91428d2997..58fe1d96997c 100755 --- a/tools/testing/selftests/livepatch/test-sysfs.sh +++ b/tools/testing/selftests/livepatch/test-sysfs.sh @@ -5,6 +5,8 @@ . 
$(dirname $0)/functions.sh MOD_LIVEPATCH=test_klp_livepatch +MOD_LIVEPATCH2=test_klp_callbacks_demo +MOD_LIVEPATCH3=test_klp_syscall setup_config @@ -19,6 +21,8 @@ check_sysfs_rights "$MOD_LIVEPATCH" "enabled" "-rw-r--r--" check_sysfs_value "$MOD_LIVEPATCH" "enabled" "1" check_sysfs_rights "$MOD_LIVEPATCH" "force" "--w-------" check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--" +check_sysfs_rights "$MOD_LIVEPATCH" "stack_order" "-r--r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" check_sysfs_rights "$MOD_LIVEPATCH" "transition" "-r--r--r--" check_sysfs_value "$MOD_LIVEPATCH" "transition" "0" check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--" @@ -131,4 +135,71 @@ livepatch: '$MOD_LIVEPATCH': completing unpatching transition livepatch: '$MOD_LIVEPATCH': unpatching complete % rmmod $MOD_LIVEPATCH" +start_test "sysfs test stack_order value" + +load_lp $MOD_LIVEPATCH + +check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" + +load_lp $MOD_LIVEPATCH2 + +check_sysfs_value "$MOD_LIVEPATCH2" "stack_order" "2" + +load_lp $MOD_LIVEPATCH3 + +check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "3" + +disable_lp $MOD_LIVEPATCH2 +unload_lp $MOD_LIVEPATCH2 + +check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" +check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "2" + +disable_lp $MOD_LIVEPATCH3 +unload_lp $MOD_LIVEPATCH3 + +disable_lp $MOD_LIVEPATCH +unload_lp $MOD_LIVEPATCH + +check_result "% insmod test_modules/$MOD_LIVEPATCH.ko +livepatch: enabling patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': initializing patching transition +livepatch: '$MOD_LIVEPATCH': starting patching transition +livepatch: '$MOD_LIVEPATCH': completing patching transition +livepatch: '$MOD_LIVEPATCH': patching complete +% insmod test_modules/$MOD_LIVEPATCH2.ko +livepatch: enabling patch '$MOD_LIVEPATCH2' +livepatch: '$MOD_LIVEPATCH2': initializing patching transition +$MOD_LIVEPATCH2: pre_patch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': starting patching transition +livepatch: '$MOD_LIVEPATCH2': completing patching transition +$MOD_LIVEPATCH2: post_patch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': patching complete +% insmod test_modules/$MOD_LIVEPATCH3.ko +livepatch: enabling patch '$MOD_LIVEPATCH3' +livepatch: '$MOD_LIVEPATCH3': initializing patching transition +livepatch: '$MOD_LIVEPATCH3': starting patching transition +livepatch: '$MOD_LIVEPATCH3': completing patching transition +livepatch: '$MOD_LIVEPATCH3': patching complete +% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled +livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition +$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': starting unpatching transition +livepatch: '$MOD_LIVEPATCH2': completing unpatching transition +$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': unpatching complete +% rmmod $MOD_LIVEPATCH2 +% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH3/enabled +livepatch: '$MOD_LIVEPATCH3': initializing unpatching transition +livepatch: '$MOD_LIVEPATCH3': starting unpatching transition +livepatch: '$MOD_LIVEPATCH3': completing unpatching transition +livepatch: '$MOD_LIVEPATCH3': unpatching complete +% rmmod $MOD_LIVEPATCH3 +% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled +livepatch: '$MOD_LIVEPATCH': initializing unpatching transition +livepatch: '$MOD_LIVEPATCH': starting unpatching transition +livepatch: '$MOD_LIVEPATCH': completing unpatching transition +livepatch: '$MOD_LIVEPATCH': unpatching complete +% rmmod $MOD_LIVEPATCH" + exit 0 -- cgit 
v1.2.3 From 82c1f13de315f38ecdb63d8b0e63ad7d70994d55 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 9 Dec 2024 13:04:55 +0000 Subject: selftests/bpf: Add more stats into veristat Extend veristat to collect and print more stats, namely: - program size in instructions - jited program size in bytes - program type - attach type - stack depth Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241209130455.94592-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/veristat.c | 64 ++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index e12ef953fba8..162fe27d06f8 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -34,6 +34,11 @@ enum stat_id { PEAK_STATES, MAX_STATES_PER_INSN, MARK_READ_MAX_LEN, + SIZE, + JITED_SIZE, + STACK, + PROG_TYPE, + ATTACH_TYPE, FILE_NAME, PROG_NAME, @@ -640,19 +645,21 @@ cleanup: } static const struct stat_specs default_output_spec = { - .spec_cnt = 7, + .spec_cnt = 8, .ids = { FILE_NAME, PROG_NAME, VERDICT, DURATION, - TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, + TOTAL_INSNS, TOTAL_STATES, SIZE, JITED_SIZE }, }; static const struct stat_specs default_csv_output_spec = { - .spec_cnt = 9, + .spec_cnt = 14, .ids = { FILE_NAME, PROG_NAME, VERDICT, DURATION, TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, MAX_STATES_PER_INSN, MARK_READ_MAX_LEN, + SIZE, JITED_SIZE, PROG_TYPE, ATTACH_TYPE, + STACK, }, }; @@ -688,6 +695,11 @@ static struct stat_def { [PEAK_STATES] = { "Peak states", {"peak_states"}, }, [MAX_STATES_PER_INSN] = { "Max states per insn", {"max_states_per_insn"}, }, [MARK_READ_MAX_LEN] = { "Max mark read length", {"max_mark_read_len", "mark_read"}, }, + [SIZE] = { "Program size", {"prog_size"}, }, + [JITED_SIZE] = { "Jited size", {"prog_size_jited"}, }, + [STACK] = {"Stack depth", {"stack_depth", "stack"}, }, + [PROG_TYPE] = { "Program type", {"prog_type"}, }, + [ATTACH_TYPE] = { "Attach type", {"attach_type", }, }, }; static bool parse_stat_id_var(const char *name, size_t len, int *id, @@ -835,7 +847,8 @@ static char verif_log_buf[64 * 1024]; static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *s) { const char *cur; - int pos, lines; + int pos, lines, sub_stack, cnt = 0; + char *state = NULL, *token, stack[512]; buf[buf_sz - 1] = '\0'; @@ -853,15 +866,22 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats * if (1 == sscanf(cur, "verification time %ld usec\n", &s->stats[DURATION])) continue; - if (6 == sscanf(cur, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld", + if (5 == sscanf(cur, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld", &s->stats[TOTAL_INSNS], &s->stats[MAX_STATES_PER_INSN], &s->stats[TOTAL_STATES], &s->stats[PEAK_STATES], &s->stats[MARK_READ_MAX_LEN])) continue; - } + if (1 == sscanf(cur, "stack depth %511s", stack)) + continue; + } + while ((token = strtok_r(cnt++ ? 
NULL : stack, "+", &state))) { + if (sscanf(token, "%d", &sub_stack) == 0) + break; + s->stats[STACK] += sub_stack; + } return 0; } @@ -1146,8 +1166,11 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf char *buf; int buf_sz, log_level; struct verif_stats *stats; + struct bpf_prog_info info; + __u32 info_len = sizeof(info); int err = 0; void *tmp; + int fd; if (!should_process_file_prog(base_filename, bpf_program__name(prog))) { env.progs_skipped++; @@ -1196,6 +1219,15 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf stats->file_name = strdup(base_filename); stats->prog_name = strdup(bpf_program__name(prog)); stats->stats[VERDICT] = err == 0; /* 1 - success, 0 - failure */ + stats->stats[SIZE] = bpf_program__insn_cnt(prog); + stats->stats[PROG_TYPE] = bpf_program__type(prog); + stats->stats[ATTACH_TYPE] = bpf_program__expected_attach_type(prog); + + memset(&info, 0, info_len); + fd = bpf_program__fd(prog); + if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) + stats->stats[JITED_SIZE] = info.jited_prog_len; + parse_verif_log(buf, buf_sz, stats); if (env.verbose) { @@ -1309,6 +1341,11 @@ static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2, case PROG_NAME: cmp = strcmp(s1->prog_name, s2->prog_name); break; + case ATTACH_TYPE: + case PROG_TYPE: + case SIZE: + case JITED_SIZE: + case STACK: case VERDICT: case DURATION: case TOTAL_INSNS: @@ -1523,12 +1560,27 @@ static void prepare_value(const struct verif_stats *s, enum stat_id id, else *str = s->stats[VERDICT] ? "success" : "failure"; break; + case ATTACH_TYPE: + if (!s) + *str = "N/A"; + else + *str = libbpf_bpf_attach_type_str(s->stats[ATTACH_TYPE]) ?: "N/A"; + break; + case PROG_TYPE: + if (!s) + *str = "N/A"; + else + *str = libbpf_bpf_prog_type_str(s->stats[PROG_TYPE]) ?: "N/A"; + break; case DURATION: case TOTAL_INSNS: case TOTAL_STATES: case PEAK_STATES: case MAX_STATES_PER_INSN: case MARK_READ_MAX_LEN: + case STACK: + case SIZE: + case JITED_SIZE: *val = s ? s->stats[id] : 0; break; default: -- cgit v1.2.3 From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 10 Sep 2024 00:44:32 +0000 Subject: tools: hv: Fix a complier warning in the fcopy uio daemon hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated writing up to 14 bytes into a region of size 10 [-Wformat-truncation=] 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); Also added 'static' for the array 'desc[]'. 
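To make the arithmetic behind the warning concrete, here is a small standalone sketch (the buffer and function names are placeholders, not the daemon's code): with the old 15-byte buffers, the 5-byte "/dev/" prefix leaves 10 destination bytes for a name that may be up to 14 bytes plus the terminating NUL, which is exactly what -Wformat-truncation reports. Sizing the destination with PATH_MAX, as the patch does, makes truncation impossible; the return-value check shown here is an optional extra, not taken from the patch.

#include <limits.h>	/* PATH_MAX, NAME_MAX */
#include <stdio.h>

static char uio_name[NAME_MAX] = "uio0";	/* placeholder device name */
static char uio_dev_path[PATH_MAX];

static int build_uio_dev_path(void)
{
	int n;

	/* "/dev/" plus any NAME_MAX-sized name fits in PATH_MAX bytes,
	 * so gcc can no longer prove possible truncation and stays quiet.
	 */
	n = snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name);
	if (n < 0 || (size_t)n >= sizeof(uio_dev_path))
		return -1;	/* error or truncated */
	return 0;
}
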
Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver") Cc: stable@vger.kernel.org # 6.10+ Signed-off-by: Dexuan Cui Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com Signed-off-by: Wei Liu Message-ID: <20240910004433.50254-1-decui@microsoft.com> --- tools/hv/hv_fcopy_uio_daemon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 7a00f3066a98..12743d7f164f 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -35,8 +35,6 @@ #define WIN8_SRV_MINOR 1 #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) -#define MAX_FOLDER_NAME 15 -#define MAX_PATH_LEN 15 #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" #define FCOPY_VER_COUNT 1 @@ -51,7 +49,7 @@ static const int fw_versions[] = { #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ -unsigned char desc[HV_RING_SIZE]; +static unsigned char desc[HV_RING_SIZE]; static int target_fd; static char target_fname[PATH_MAX]; @@ -409,8 +407,8 @@ int main(int argc, char *argv[]) struct vmbus_br txbr, rxbr; void *ring; uint32_t len = HV_RING_SIZE; - char uio_name[MAX_FOLDER_NAME] = {0}; - char uio_dev_path[MAX_PATH_LEN] = {0}; + char uio_name[NAME_MAX] = {0}; + char uio_dev_path[PATH_MAX] = {0}; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, -- cgit v1.2.3 From 91ae69c7ed9e262f24240c425ad1eef2cf6639b7 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 16 Oct 2024 16:35:10 +0200 Subject: tools: hv: change permissions of NetworkManager configuration file Align permissions of the resulting .nmconnection file, instead of the input file from hv_kvp_daemon. To avoid the tiny time frame where the output file is world-readable, use umask instead of chmod. Fixes: 42999c904612 ("hv/hv_kvp_daemon:Support for keyfile based connection profile") Signed-off-by: Olaf Hering Reviewed-by: Shradha Gupta Link: https://lore.kernel.org/r/20241016143521.3735-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241016143521.3735-1-olaf@aepfle.de> --- tools/hv/hv_set_ifconfig.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh index 440a91b35823..2f8baed2b8f7 100755 --- a/tools/hv/hv_set_ifconfig.sh +++ b/tools/hv/hv_set_ifconfig.sh @@ -81,7 +81,7 @@ echo "ONBOOT=yes" >> $1 cp $1 /etc/sysconfig/network-scripts/ -chmod 600 $2 +umask 0177 interface=$(echo $2 | awk -F - '{ print $2 }') filename="${2##*/}" -- cgit v1.2.3 From a9640fcdd400463442846677e62b8208b81cb031 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 5 Nov 2024 09:14:04 +0100 Subject: tools/hv: terminate fcopy daemon if read from uio fails Terminate endless loop in reading fails, to avoid flooding syslog. This happens if the state of "Guest services" integration service is changed from "enabled" to "disabled" at runtime in the VM settings. In this case pread returns EIO. Also handle an interrupted system call, and continue in this case. 
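The resulting read loop follows a common pattern, sketched standalone below (the descriptor and function names are placeholders rather than the daemon's actual code): transient conditions such as EINTR or EAGAIN are retried, while any other failure, e.g. the EIO case described above, ends the loop so syslog is not flooded.

#include <errno.h>
#include <string.h>
#include <syslog.h>
#include <unistd.h>

/* Returns 0 once a value has been read, -1 on a persistent error (e.g. EIO). */
static int read_event(int fd)
{
	int tmp;

	for (;;) {
		if (pread(fd, &tmp, sizeof(tmp), 0) >= 0)
			return 0;
		if (errno == EINTR || errno == EAGAIN)
			continue;	/* transient: try again */
		syslog(LOG_ERR, "pread failed: %s", strerror(errno));
		return -1;	/* persistent: let the caller terminate */
	}
}
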
Signed-off-by: Olaf Hering Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20241105081437.15689-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241105081437.15689-1-olaf@aepfle.de> --- tools/hv/hv_fcopy_uio_daemon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 12743d7f164f..0198321d14a2 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -466,8 +466,10 @@ int main(int argc, char *argv[]) */ ret = pread(fcopy_fd, &tmp, sizeof(int), 0); if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - continue; + goto close; } len = HV_RING_SIZE; -- cgit v1.2.3 From 07dfa6e821e1c58cbd0f195173dddbd593721f9b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Nov 2024 16:04:01 +0100 Subject: hv/hv_kvp_daemon: Pass NIC name to hv_get_dns_info as well The reference implementation of hv_get_dns_info which is in the tree uses /etc/resolv.conf to get DNS servers and this does not require to know which NIC is queried. Distro specific implementations, however, may want to provide per-NIC, fine grained information. E.g. NetworkManager keeps track of DNS servers per connection. Similar to hv_get_dhcp_info, pass NIC name as a parameter to hv_get_dns_info script. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20241112150401.217094-1-vkuznets@redhat.com Signed-off-by: Wei Liu Message-ID: <20241112150401.217094-1-vkuznets@redhat.com> --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index ae57bf69ad4a..296a7a62c54d 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s", "hv_get_dns_info"); + sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. -- cgit v1.2.3 From a4d024fe2e77063069c5f423f2f9be766450f0f9 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:04:10 +0100 Subject: tools/hv: reduce resouce usage in hv_get_dns_info helper Remove the usage of cat. Replace the shell process with awk with 'exec'. Also use a generic shell because no bash specific features will be used. Signed-off-by: Olaf Hering Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241202120432.21115-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202120432.21115-1-olaf@aepfle.de> --- tools/hv/hv_get_dns_info.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/hv/hv_get_dns_info.sh b/tools/hv/hv_get_dns_info.sh index 058c17b46ffc..268521234d4b 100755 --- a/tools/hv/hv_get_dns_info.sh +++ b/tools/hv/hv_get_dns_info.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # This example script parses /etc/resolv.conf to retrive DNS information. # In the interest of keeping the KVP daemon code free of distro specific @@ -10,4 +10,4 @@ # this script can be based on the Network Manager APIs for retrieving DNS # entries. 
-cat /etc/resolv.conf 2>/dev/null | awk '/^nameserver/ { print $2 }' +exec awk '/^nameserver/ { print $2 }' /etc/resolv.conf 2>/dev/null -- cgit v1.2.3 From becc7fe329c09a7744fa908fca83418fa94a45a0 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:40:52 +0100 Subject: tools/hv: add a .gitignore file Remove generated files from 'git status' output after 'make -C tools/hv'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20241202124107.28650-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202124107.28650-1-olaf@aepfle.de> --- tools/hv/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/hv/.gitignore (limited to 'tools') diff --git a/tools/hv/.gitignore b/tools/hv/.gitignore new file mode 100644 index 000000000000..0c5bc15d602f --- /dev/null +++ b/tools/hv/.gitignore @@ -0,0 +1,3 @@ +hv_fcopy_uio_daemon +hv_kvp_daemon +hv_vss_daemon -- cgit v1.2.3 From 175c71c2aceef173ae6d3dceb41edfc2ac0d5937 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 8 Dec 2024 23:47:17 +0000 Subject: tools/hv: reduce resource usage in hv_kvp_daemon hv_kvp_daemon uses popen(3) and system(3) as convinience helper to launch external helpers. These helpers are invoked via a temporary shell process. There is no need to keep this temporary process around while the helper runs. Replace this temporary shell with the actual helper process via 'exec'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/linux-hyperv/20241202123520.27812-1-olaf@aepfle.de/ Signed-off-by: Wei Liu --- tools/hv/hv_kvp_daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 296a7a62c54d..04ba035d67e9 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. @@ -742,7 +742,7 @@ static void kvp_get_ipconfig_info(char *if_name, * Enabled: DHCP enabled. */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dhcp_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dhcp_info", if_name); file = popen(cmd, "r"); if (file == NULL) @@ -1606,8 +1606,9 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) * invoke the external script to do its magic. */ - str_len = snprintf(cmd, sizeof(cmd), KVP_SCRIPTS_PATH "%s %s %s", - "hv_set_ifconfig", if_filename, nm_filename); + str_len = snprintf(cmd, sizeof(cmd), "exec %s %s %s", + KVP_SCRIPTS_PATH "hv_set_ifconfig", + if_filename, nm_filename); /* * This is a little overcautious, but it's necessary to suppress some * false warnings from gcc 8.0.1. -- cgit v1.2.3 From 4d33dc1bc31df80356c49e40dbd3ddff19500bcb Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Mon, 9 Dec 2024 12:27:20 +0530 Subject: selftests/bpf: Fix fill_link_info selftest on powerpc With CONFIG_KPROBES_ON_FTRACE enabled on powerpc, ftrace_location_range returns ftrace location for bpf_fentry_test1 at offset of 4 bytes from function entry. This is because branch to _mcount function is at offset of 4 bytes in function profile sequence. To fix this, add entry_offset of 4 bytes while verifying the address for kprobe entry address of bpf_fentry_test1 in verify_perf_link_info in selftest, when CONFIG_KPROBES_ON_FTRACE is enabled. 
Disassemble of bpf_fentry_test1: c000000000e4b080 : c000000000e4b080: a6 02 08 7c mflr r0 c000000000e4b084: b9 e2 22 4b bl c00000000007933c <_mcount> c000000000e4b088: 01 00 63 38 addi r3,r3,1 c000000000e4b08c: b4 07 63 7c extsw r3,r3 c000000000e4b090: 20 00 80 4e blr When CONFIG_PPC_FTRACE_OUT_OF_LINE [1] is enabled, these function profile sequence is moved out of line with an unconditional branch at offset 0. So, the test works without altering the offset for 'CONFIG_KPROBES_ON_FTRACE && CONFIG_PPC_FTRACE_OUT_OF_LINE' case. Disassemble of bpf_fentry_test1: c000000000f95190 : c000000000f95190: 00 00 00 60 nop c000000000f95194: 01 00 63 38 addi r3,r3,1 c000000000f95198: b4 07 63 7c extsw r3,r3 c000000000f9519c: 20 00 80 4e blr [1] https://lore.kernel.org/all/20241030070850.1361304-13-hbathini@linux.ibm.com/ Fixes: 23cf7aa539dc ("selftests/bpf: Add selftest for fill_link_info") Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241209065720.234344-1-skb99@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/fill_link_info.c | 4 ++++ tools/testing/selftests/bpf/progs/test_fill_link_info.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index d50cbd8040d4..e59af2aa6601 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -171,6 +171,10 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel, /* See also arch_adjust_kprobe_addr(). */ if (skel->kconfig->CONFIG_X86_KERNEL_IBT) entry_offset = 4; + if (skel->kconfig->CONFIG_PPC64 && + skel->kconfig->CONFIG_KPROBES_ON_FTRACE && + !skel->kconfig->CONFIG_PPC_FTRACE_OUT_OF_LINE) + entry_offset = 4; err = verify_perf_link_info(link_fd, type, kprobe_addr, 0, entry_offset); ASSERT_OK(err, "verify_perf_link_info"); } else { diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c index 6afa834756e9..fac33a14f200 100644 --- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c +++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c @@ -6,13 +6,20 @@ #include extern bool CONFIG_X86_KERNEL_IBT __kconfig __weak; +extern bool CONFIG_PPC_FTRACE_OUT_OF_LINE __kconfig __weak; +extern bool CONFIG_KPROBES_ON_FTRACE __kconfig __weak; +extern bool CONFIG_PPC64 __kconfig __weak; -/* This function is here to have CONFIG_X86_KERNEL_IBT - * used and added to object BTF. +/* This function is here to have CONFIG_X86_KERNEL_IBT, + * CONFIG_PPC_FTRACE_OUT_OF_LINE, CONFIG_KPROBES_ON_FTRACE, + * CONFIG_PPC6 used and added to object BTF. */ int unused(void) { - return CONFIG_X86_KERNEL_IBT ? 0 : 1; + return CONFIG_X86_KERNEL_IBT || + CONFIG_PPC_FTRACE_OUT_OF_LINE || + CONFIG_KPROBES_ON_FTRACE || + CONFIG_PPC64 ? 0 : 1; } SEC("kprobe") -- cgit v1.2.3 From 8653eb21d68c6882ce5716b04379431817310b85 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:58 +0100 Subject: selftests: net: lib: Rename ip_link_master() to ip_link_set_master() Let's have a verb in that function name to make it clearer what's going on. 
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/fbf7c53a429b340b9cff5831280ea8c305a224f9.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fdb_notify.sh | 6 +++--- tools/testing/selftests/net/lib.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh index c03151e7791c..c159230c9b62 100755 --- a/tools/testing/selftests/net/fdb_notify.sh +++ b/tools/testing/selftests/net/fdb_notify.sh @@ -49,7 +49,7 @@ test_dup_vxlan_self() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan" dev vx self dst 192.0.2.1 do_test_dup del "vxlan" dev vx self dst 192.0.2.1 @@ -59,7 +59,7 @@ test_dup_vxlan_master() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan master" dev vx master do_test_dup del "vxlan master" dev vx master @@ -79,7 +79,7 @@ test_dup_macvlan_master() ip_link_add br up type bridge vlan_filtering 1 ip_link_add dd up type dummy ip_link_add mv up link dd type macvlan mode passthru - ip_link_master mv br + ip_link_set_master mv br do_test_dup add "macvlan master" dev mv self do_test_dup del "macvlan master" dev mv self diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8994fec1c38f..5ea6537acd2b 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -451,7 +451,7 @@ ip_link_add() defer ip link del dev "$name" } -ip_link_master() +ip_link_set_master() { local member=$1; shift local master=$1; shift -- cgit v1.2.3 From d76ccb2ec368c8a44f64839140cd253c19f6a79a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:59 +0100 Subject: selftests: net: lib: Add several autodefer helpers Add ip_link_set_addr(), ip_link_set_up(), ip_addr_add() and ip_route_add() to the suite of helpers that automatically schedule a corresponding cleanup. When setting a new MAC, one needs to remember the old address first. Move mac_get() from forwarding/ to that end. 
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/add6bcbe30828fd01363266df20c338cf13aaf25.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 7 ----- tools/testing/selftests/net/lib.sh | 39 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 7337f398f9cc..1fd40bada694 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -932,13 +932,6 @@ packets_rate() echo $(((t1 - t0) / interval)) } -mac_get() -{ - local if_name=$1 - - ip -j link show dev $if_name | jq -r '.[]["address"]' -} - ether_addr_to_u64() { local addr="$1" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 5ea6537acd2b..2cd5c743b2d9 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -435,6 +435,13 @@ xfail_on_veth() fi } +mac_get() +{ + local if_name=$1 + + ip -j link show dev $if_name | jq -r '.[]["address"]' +} + kill_process() { local pid=$1; shift @@ -459,3 +466,35 @@ ip_link_set_master() ip link set dev "$member" master "$master" defer ip link set dev "$member" nomaster } + +ip_link_set_addr() +{ + local name=$1; shift + local addr=$1; shift + + local old_addr=$(mac_get "$name") + ip link set dev "$name" address "$addr" + defer ip link set dev "$name" address "$old_addr" +} + +ip_link_set_up() +{ + local name=$1; shift + + ip link set dev "$name" up + defer ip link set dev "$name" down +} + +ip_addr_add() +{ + local name=$1; shift + + ip addr add dev "$name" "$@" + defer ip addr del dev "$name" "$@" +} + +ip_route_add() +{ + ip route add "$@" + defer ip route del "$@" +} -- cgit v1.2.3 From d84b5dccf3ebdeeabef910d1c19b931c84f67884 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:41:00 +0100 Subject: selftests: forwarding: Add a selftest for the new reserved_bits UAPI Run VXLAN packets through a gateway. Flip individual bits of the packet and/or reserved bits of the gateway, and check that the gateway treats the packets as expected. 
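For reference, the 8-byte VXLAN header that the test builds and then mangles can be sketched in C as below (purely illustrative; it mirrors the vxlan_header_bytes() helper in the script further down): byte 0 holds the flags, of which only the I bit (0x08) is defined, bytes 4-6 hold the 24-bit VNI, and every other bit is reserved and covered by the reserved_bits option being tested.

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Build a well-formed VXLAN header: I flag set, VNI filled in, all
 * reserved bits left at zero. Flipping any other bit produces one of
 * the "mangled" packets the test sends at the gateway.
 */
static void build_vxlan_header(uint8_t hdr[8], uint32_t vni)
{
	uint32_t flags_word = htonl(0x08000000);		/* RRRRIRRR + 24 reserved bits */
	uint32_t vni_word = htonl((vni & 0xffffffu) << 8);	/* 24-bit VNI + 8 reserved bits */

	memcpy(hdr, &flags_word, sizeof(flags_word));
	memcpy(hdr + 4, &vni_word, sizeof(vni_word));
}
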
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/388bef3c30ebc887d4e64cd86a362e2df2f2d2e1.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/Makefile | 1 + .../selftests/net/forwarding/vxlan_reserved.sh | 352 +++++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/vxlan_reserved.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 7d885cff8d79..00bde7b6f39e 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ + vxlan_reserved.sh \ vxlan_symmetric_ipv6.sh \ vxlan_symmetric.sh diff --git a/tools/testing/selftests/net/forwarding/vxlan_reserved.sh b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh new file mode 100755 index 000000000000..46c31794b91b --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh @@ -0,0 +1,352 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|---------------+ +# | +# +----|--------------------------------+ +# | SW | | +# | +--|------------------------------+ | +# | | + $swp1 BR1 (802.1d) | | +# | | | | +# | | + vx1 (vxlan) | | +# | | local 192.0.2.17 | | +# | | id 1000 dstport $VXPORT | | +# | +---------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +--|----------------------------------+ +# | +# +--|----------------------------------+ +# | | | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | +# | VRP2 (vrf) | +# +-------------------------------------+ + +: ${VXPORT:=4789} +: ${ALL_TESTS:=" + default_test + plain_test + reserved_0_test + reserved_10_test + reserved_31_test + reserved_56_test + reserved_63_test + "} + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + defer simple_if_fini $h1 192.0.2.1/28 + + tc qdisc add dev $h1 clsact + defer tc qdisc del dev $h1 clsact + + tc filter add dev $h1 ingress pref 77 \ + prot ip flower skip_hw ip_proto icmp action drop + defer tc filter del dev $h1 ingress pref 77 +} + +switch_create() +{ + ip_link_add br1 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip_link_set_addr br1 $(mac_get $swp1) + ip_link_set_up br1 + + ip_link_set_up $rp1 + ip_addr_add $rp1 192.0.2.17/28 + ip_route_add 192.0.2.32/28 nexthop via 192.0.2.18 + + ip_link_set_master $swp1 br1 + ip_link_set_up $swp1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + defer simple_if_fini $rp2 192.0.2.18/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + h1_create + switch_create + + vrp2_create +} + +vxlan_header_bytes() +{ + local vni=$1; shift + local -a extra_bits=("$@") + local -a bits + local i + + for ((i=0; i < 64; i++)); do + bits[i]=0 + done + + # Bit 4 is the I flag and is always on. 
+ bits[4]=1 + + for i in ${extra_bits[@]}; do + bits[i]=1 + done + + # Bits 32..55 carry the VNI + local mask=0x800000 + for ((i=0; i < 24; i++)); do + bits[$((i + 32))]=$(((vni & mask) != 0)) + ((mask >>= 1)) + done + + local bytes + for ((i=0; i < 8; i++)); do + local byte=0 + local j + for ((j=0; j < 8; j++)); do + local bit=${bits[8 * i + j]} + ((byte += bit << (7 - j))) + done + bytes+=$(printf %02x $byte): + done + + echo ${bytes%:} +} + +neg_bytes() +{ + local bytes=$1; shift + + local -A neg=([0]=f [1]=e [2]=d [3]=c [4]=b [5]=a [6]=9 [7]=8 + [8]=7 [9]=6 [a]=5 [b]=4 [c]=3 [d]=2 [e]=1 [f]=0 [:]=:) + local out + local i + + for ((i=0; i < ${#bytes}; i++)); do + local c=${bytes:$i:1} + out+=${neg[$c]} + done + echo $out +} + +vxlan_ping_do() +{ + local count=$1; shift + local dev=$1; shift + local next_hop_mac=$1; shift + local dest_ip=$1; shift + local dest_mac=$1; shift + local vni=$1; shift + local reserved_bits=$1; shift + + local vxlan_header=$(vxlan_header_bytes $vni $reserved_bits) + + $MZ $dev -c $count -d 100msec -q \ + -b $next_hop_mac -B $dest_ip \ + -t udp sp=23456,dp=$VXPORT,p=$(: + )"$vxlan_header:"$( : VXLAN + )"$dest_mac:"$( : ETH daddr + )"00:11:22:33:44:55:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"00:00:"$( : IP header csum + )"$(ipv4_to_bytes 192.0.2.3):"$( : IP saddr + )"$(ipv4_to_bytes 192.0.2.1):"$( : IP daddr + )"08:"$( : ICMP type + )"00:"$( : ICMP code + )"8b:f2:"$( : ICMP csum + )"1f:6a:"$( : ICMP request identifier + )"00:01:"$( : ICMP request seq. number + )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload + )"6d:74:0b:00:00:00:00:00:"$( : + )"10:11:12:13:14:15:16:17:"$( : + )"18:19:1a:1b:1c:1d:1e:1f:"$( : + )"20:21:22:23:24:25:26:27:"$( : + )"28:29:2a:2b:2c:2d:2e:2f:"$( : + )"30:31:32:33:34:35:36:37" +} + +vxlan_device_add() +{ + ip_link_add vx1 up type vxlan id 1000 \ + local 192.0.2.17 dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 100 "$@" + ip_link_set_master vx1 br1 +} + +vxlan_all_reserved_bits() +{ + local i + + for ((i=0; i < 64; i++)); do + if ((i == 4 || i >= 32 && i < 56)); then + continue + fi + echo $i + done +} + +vxlan_ping_vanilla() +{ + vxlan_ping_do 10 $rp2 $(mac_get $rp1) 192.0.2.17 $(mac_get $h1) 1000 +} + +vxlan_ping_reserved() +{ + for bit in $(vxlan_all_reserved_bits); do + vxlan_ping_do 1 $rp2 $(mac_get $rp1) \ + 192.0.2.17 $(mac_get $h1) 1000 "$bit" + ((n++)) + done +} + +vxlan_ping_test() +{ + local what=$1; shift + local get_stat=$1; shift + local expect=$1; shift + + RET=0 + + local t0=$($get_stat) + + "$@" + check_err $? "Failure when running $@" + + local t1=$($get_stat) + local delta=$((t1 - t0)) + + ((expect == delta)) + check_err $? "Expected to capture $expect packets, got $delta." + + log_test "$what" +} + +__default_test_do() +{ + local n_allowed_bits=$1; shift + local what=$1; shift + + vxlan_ping_test "$what: clean packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + 10 vxlan_ping_vanilla + + local t0=$(link_stats_get vx1 rx errors) + vxlan_ping_test "$what: mangled packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + $n_allowed_bits vxlan_ping_reserved + local t1=$(link_stats_get vx1 rx errors) + + RET=0 + local expect=$((39 - n_allowed_bits)) + local delta=$((t1 - t0)) + ((expect == delta)) + check_err $? "Expected $expect error packets, got $delta." 
+ log_test "$what: drops reported" +} + +default_test_do() +{ + vxlan_device_add + __default_test_do 0 "Default" +} + +default_test() +{ + in_defer_scope \ + default_test_do +} + +plain_test_do() +{ + vxlan_device_add reserved_bits 0xf7ffffff000000ff + __default_test_do 0 "reserved_bits 0xf7ffffff000000ff" +} + +plain_test() +{ + in_defer_scope \ + plain_test_do +} + +reserved_test() +{ + local bit=$1; shift + + local allowed_bytes=$(vxlan_header_bytes 0xffffff $bit) + local reserved_bytes=$(neg_bytes $allowed_bytes) + local reserved_bits=${reserved_bytes//:/} + + vxlan_device_add reserved_bits 0x$reserved_bits + __default_test_do 1 "reserved_bits 0x$reserved_bits" +} + +reserved_0_test() +{ + in_defer_scope \ + reserved_test 0 +} + +reserved_10_test() +{ + in_defer_scope \ + reserved_test 10 +} + +reserved_31_test() +{ + in_defer_scope \ + reserved_test 31 +} + +reserved_56_test() +{ + in_defer_scope \ + reserved_test 56 +} + +reserved_63_test() +{ + in_defer_scope \ + reserved_test 63 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 694389cd2bdfc6bc646bbb0fd2a5684c5e8d5fbf Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 22 Nov 2024 15:47:57 +0800 Subject: selftests/cpufreq: gitignore output files and clean them in make clean After `make run_tests`, the git status complains: Untracked files: (use "git add ..." to include in what will be committed) cpufreq/cpufreq_selftest.dmesg_cpufreq.txt cpufreq/cpufreq_selftest.dmesg_full.txt cpufreq/cpufreq_selftest.txt Link: https://lore.kernel.org/all/20241122074757.1583002-1-lizhijian@fujitsu.com/ Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: Shuah Khan Signed-off-by: Li Zhijian Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/.gitignore | 2 ++ tools/testing/selftests/cpufreq/Makefile | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/cpufreq/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/cpufreq/.gitignore b/tools/testing/selftests/cpufreq/.gitignore new file mode 100644 index 000000000000..67604e91e068 --- /dev/null +++ b/tools/testing/selftests/cpufreq/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +cpufreq_selftest.* diff --git a/tools/testing/selftests/cpufreq/Makefile b/tools/testing/selftests/cpufreq/Makefile index c86ca8342222..9b2ccb10b0cf 100644 --- a/tools/testing/selftests/cpufreq/Makefile +++ b/tools/testing/selftests/cpufreq/Makefile @@ -3,6 +3,7 @@ all: TEST_PROGS := main.sh TEST_FILES := cpu.sh cpufreq.sh governor.sh module.sh special-tests.sh +EXTRA_CLEAN := cpufreq_selftest.dmesg_cpufreq.txt cpufreq_selftest.dmesg_full.txt cpufreq_selftest.txt include ../lib.mk -- cgit v1.2.3 From 3075476a7af666de3ec10b4f35d8e62db8fd5b6d Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 29 Nov 2024 09:20:05 +0800 Subject: pm: cpupower: Makefile: Fix cross compilation After commit f79473ed9220 ("pm: cpupower: Makefile: Allow overriding cross-compiling env params") we would fail to cross compile cpupower in buildroot which uses the recipe at [1] where only the CROSS variable is being set. The issue here is the use of the lazy evaluation for all variables: CC, LD, AR, STRIP, RANLIB, rather than just CROSS. 
[1]: https://git.buildroot.net/buildroot/tree/package/linux-tools/linux-tool-cpupower.mk.in Fixes: f79473ed9220 ("pm: cpupower: Makefile: Allow overriding cross-compiling env params") Reported-by: Florian Fainelli Closes: https://lore.kernel.org/all/2bbabd2c-24ef-493c-a199-594e5dada3da@broadcom.com/ Signed-off-by: Peng Fan Tested-by: Florian Fainelli Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 175004ce44b2..51a95239fe06 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -87,11 +87,19 @@ INSTALL_SCRIPT = ${INSTALL} -m 644 # to something more interesting, like "arm-linux-". If you want # to compile vs uClibc, that can be done here as well. CROSS ?= #/usr/i386-linux-uclibc/usr/bin/i386-uclibc- +ifneq ($(CROSS), ) +CC = $(CROSS)gcc +LD = $(CROSS)gcc +AR = $(CROSS)ar +STRIP = $(CROSS)strip +RANLIB = $(CROSS)ranlib +else CC ?= $(CROSS)gcc LD ?= $(CROSS)gcc AR ?= $(CROSS)ar STRIP ?= $(CROSS)strip RANLIB ?= $(CROSS)ranlib +endif HOSTCC = gcc MKDIR = mkdir -- cgit v1.2.3 From 46fd8c707b552c0a846917192f66e623bb03f976 Mon Sep 17 00:00:00 2001 From: wangfushuai Date: Wed, 4 Dec 2024 15:02:47 +0800 Subject: cpupower: revise is_valid flag handling for idle_monitor The is_valid flag should reflect the validity state of both the XXX_start and XXX_stop functions. But the use of '=' in XXX_stop overwrites the validity state set by XXX_start. This commit changes '=' to '|=' in XXX_stop to preserve and combine the validity state of XXX_start and XXX_stop. Signed-off-by: wangfushuai Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c | 4 ++-- tools/power/cpupower/utils/idle_monitor/mperf_monitor.c | 2 +- tools/power/cpupower/utils/idle_monitor/nhm_idle.c | 2 +- tools/power/cpupower/utils/idle_monitor/snb_idle.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c index 55e55b6b42f9..f5a2a326b1b7 100644 --- a/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c @@ -117,7 +117,7 @@ static int hsw_ext_start(void) for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - hsw_ext_get_count(num, &val, cpu); + is_valid[cpu] = !hsw_ext_get_count(num, &val, cpu); previous_count[num][cpu] = val; } } @@ -134,7 +134,7 @@ static int hsw_ext_stop(void) for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !hsw_ext_get_count(num, &val, cpu); + is_valid[cpu] |= !hsw_ext_get_count(num, &val, cpu); current_count[num][cpu] = val; } } diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index ae6af354a81d..0a03573ebcc2 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -148,7 +148,7 @@ static int mperf_measure_stats(unsigned int cpu) ret = get_aperf_mperf(cpu, &aval, &mval); aperf_current_count[cpu] = aval; mperf_current_count[cpu] = mval; - is_valid[cpu] = !ret; + is_valid[cpu] |= !ret; return 0; } diff --git a/tools/power/cpupower/utils/idle_monitor/nhm_idle.c b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c index 16eaf006f61f..6b1733782ffa 100644 
--- a/tools/power/cpupower/utils/idle_monitor/nhm_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c @@ -151,7 +151,7 @@ static int nhm_stop(void) for (num = 0; num < NHM_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !nhm_get_count(num, &val, cpu); + is_valid[cpu] |= !nhm_get_count(num, &val, cpu); current_count[num][cpu] = val; } } diff --git a/tools/power/cpupower/utils/idle_monitor/snb_idle.c b/tools/power/cpupower/utils/idle_monitor/snb_idle.c index 811d63ab17a7..5969b88a85b4 100644 --- a/tools/power/cpupower/utils/idle_monitor/snb_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/snb_idle.c @@ -115,7 +115,7 @@ static int snb_start(void) for (num = 0; num < SNB_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - snb_get_count(num, &val, cpu); + is_valid[cpu] = !snb_get_count(num, &val, cpu); previous_count[num][cpu] = val; } } @@ -132,7 +132,7 @@ static int snb_stop(void) for (num = 0; num < SNB_CSTATE_COUNT; num++) { for (cpu = 0; cpu < cpu_count; cpu++) { - is_valid[cpu] = !snb_get_count(num, &val, cpu); + is_valid[cpu] |= !snb_get_count(num, &val, cpu); current_count[num][cpu] = val; } } -- cgit v1.2.3 From 3a7a53c8d4813ef510a731f529b8c58208ab8896 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 4 Dec 2024 00:01:49 -0800 Subject: selftests/powerpc: Fix typo in test-vphn.c The word 'accross' is wrong, so fix it. Signed-off-by: Zhu Jun Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241204080149.11759-1-zhujun2@cmss.chinamobile.com --- tools/testing/selftests/powerpc/vphn/test-vphn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/vphn/test-vphn.c b/tools/testing/selftests/powerpc/vphn/test-vphn.c index 81d3069ffb84..f348f54914a9 100644 --- a/tools/testing/selftests/powerpc/vphn/test-vphn.c +++ b/tools/testing/selftests/powerpc/vphn/test-vphn.c @@ -275,7 +275,7 @@ static struct test { } }, { - /* Parse a 32-bit value split accross two consecutives 64-bit + /* Parse a 32-bit value split across two consecutives 64-bit * input values. */ "vphn: 16-bit value followed by 2 x 32-bit values", -- cgit v1.2.3 From 50a78409a2157c0340572beecab86e2b263fe2a2 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 6 Dec 2024 10:05:06 -0800 Subject: selftests/hid: Add host-tools to .gitignore When compiling these selftests the host-tools directory is generated. Add it to the .gitignore so git doesn't see these files as trackable. Signed-off-by: Charlie Jenkins Signed-off-by: Jiri Kosina --- tools/testing/selftests/hid/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/.gitignore b/tools/testing/selftests/hid/.gitignore index 746c62361f77..933f483815b2 100644 --- a/tools/testing/selftests/hid/.gitignore +++ b/tools/testing/selftests/hid/.gitignore @@ -1,5 +1,6 @@ bpftool *.skel.h +/host-tools /tools hid_bpf hidraw -- cgit v1.2.3 From 11d5245f608f8ac01c97b93f31497cef7b96e457 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:24 +0100 Subject: selftests/bpf: Extend test for sockmap update with same Verify that the sockmap link was not severed, and socket's entry is indeed removed from the map when the corresponding descriptor gets closed. 
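The expectation being exercised (our reading of the change below, not spelled
out in the patch): once the last descriptor of a socket stored in the map is
closed, sockmap drops the entry automatically, so a later
bpf_map_delete_elem() on that slot is expected to fail (typically -ENOENT)
rather than succeed:

    bpf_map_update_elem(map, &zero, &tcp, BPF_ANY);  /* entry added          */
    close(tcp);                                      /* entry auto-removed   */
    bpf_map_delete_elem(map, &zero);                 /* now expected to fail */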
Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-2-1e88579e7bd5@rbox.co --- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index fdff0652d7ef..248754296d97 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -934,8 +934,10 @@ static void test_sockmap_same_sock(void) err = socketpair(AF_UNIX, SOCK_STREAM, 0, stream); ASSERT_OK(err, "socketpair(af_unix, sock_stream)"); - if (err) + if (err) { + close(tcp); goto out; + } for (i = 0; i < 2; i++) { err = bpf_map_update_elem(map, &zero, &stream[0], BPF_ANY); @@ -954,14 +956,14 @@ static void test_sockmap_same_sock(void) ASSERT_OK(err, "bpf_map_update_elem(tcp)"); } + close(tcp); err = bpf_map_delete_elem(map, &zero); - ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + ASSERT_ERR(err, "bpf_map_delete_elem(entry)"); close(stream[0]); close(stream[1]); out: close(dgram); - close(tcp); close(udp); test_sockmap_pass_prog__destroy(skel); } -- cgit v1.2.3 From b70b073979ebf7a5271e7ce655ea1c25b4673f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 4 Dec 2024 20:37:44 +0100 Subject: tools/resolve_btfids: Add --fatal_warnings option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently warnings emitted by resolve_btfids are buried in the build log and are slipping into mainline frequently. Add an option to elevate warnings to hard errors so the CI bots can catch any new warnings. Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241204-resolve_btfids-v3-1-e6a279a74cfd@weissschuh.net --- tools/bpf/resolve_btfids/main.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index bd9f960bce3d..d47191c6e55e 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -141,6 +141,7 @@ struct object { }; static int verbose; +static int warnings; static int eprintf(int level, int var, const char *fmt, ...) 
{ @@ -604,6 +605,7 @@ static int symbols_resolve(struct object *obj) if (id->id) { pr_info("WARN: multiple IDs found for '%s': %d, %d - using %d\n", str, id->id, type_id, id->id); + warnings++; } else { id->id = type_id; (*nr)--; @@ -625,8 +627,10 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; /* For set, set8, id->id may be 0 */ - if (!id->id && !id->is_set && !id->is_set8) + if (!id->id && !id->is_set && !id->is_set8) { pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); + warnings++; + } for (i = 0; i < id->addr_cnt; i++) { unsigned long addr = id->addr[i]; @@ -782,6 +786,7 @@ int main(int argc, const char **argv) .funcs = RB_ROOT, .sets = RB_ROOT, }; + bool fatal_warnings = false; struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), @@ -789,6 +794,8 @@ int main(int argc, const char **argv) "BTF data"), OPT_STRING('b', "btf_base", &obj.base_btf_path, "file", "path of file providing base BTF"), + OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, + "turn warnings into errors"), OPT_END() }; int err = -1; @@ -823,7 +830,8 @@ int main(int argc, const char **argv) if (symbols_patch(&obj)) goto out; - err = 0; + if (!(fatal_warnings && warnings)) + err = 0; out: if (obj.efile.elf) { elf_end(obj.efile.elf); -- cgit v1.2.3 From 3f23ee5590d9605dbde9a5e1d4b97637a4803329 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:56 -0800 Subject: selftests/bpf: test for changing packet data from global functions Check if verifier is aware of packet pointers invalidation done in global functions. Based on a test shared by Nick Zavaritsky in [0]. [0] https://lore.kernel.org/bpf/0498CA22-5779-4767-9C0C-A9515CEA711F@gmail.com/ Suggested-by: Nick Zavaritsky Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_sock.c | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index d3e70e38e442..51826379a1aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1037,4 +1037,32 @@ __naked void sock_create_read_src_port(void) : __clobber_all); } +__noinline +long skb_pull_data2(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +__noinline +long skb_pull_data1(struct __sk_buff *sk, __u32 len) +{ + return skb_pull_data2(sk, len); +} + +/* global function calls bpf_skb_pull_data(), which invalidates packet + * pointers established before global function call. 
+ */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + skb_pull_data1(sk, 0); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 89ff40890d8f12a7d7e93fb602cc27562f3834f0 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:58 -0800 Subject: selftests/bpf: freplace tests for tracking of changes_packet_data Try different combinations of global functions replacement: - replace function that changes packet data with one that doesn't; - replace function that changes packet data with one that does; - replace function that doesn't change packet data with one that does; - replace function that doesn't change packet data with one that doesn't; Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/changes_pkt_data.c | 76 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/changes_pkt_data.c | 26 ++++++++ .../bpf/progs/changes_pkt_data_freplace.c | 18 +++++ 3 files changed, 120 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c new file mode 100644 index 000000000000..c0c7202f6c5c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include "changes_pkt_data_freplace.skel.h" +#include "changes_pkt_data.skel.h" +#include + +static void print_verifier_log(const char *log) +{ + if (env.verbosity >= VERBOSE_VERY) + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); +} + +static void test_aux(const char *main_prog_name, const char *freplace_prog_name, bool expect_load) +{ + struct changes_pkt_data_freplace *freplace = NULL; + struct bpf_program *freplace_prog = NULL; + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct changes_pkt_data *main = NULL; + char log[16*1024]; + int err; + + opts.kernel_log_buf = log; + opts.kernel_log_size = sizeof(log); + if (env.verbosity >= VERBOSE_SUPER) + opts.kernel_log_level = 1 | 2 | 4; + main = changes_pkt_data__open_opts(&opts); + if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) + goto out; + err = changes_pkt_data__load(main); + print_verifier_log(log); + if (!ASSERT_OK(err, "changes_pkt_data__load")) + goto out; + freplace = changes_pkt_data_freplace__open_opts(&opts); + if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) + goto out; + freplace_prog = bpf_object__find_program_by_name(freplace->obj, freplace_prog_name); + if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) + goto out; + bpf_program__set_autoload(freplace_prog, true); + bpf_program__set_autoattach(freplace_prog, true); + bpf_program__set_attach_target(freplace_prog, + bpf_program__fd(main->progs.dummy), + main_prog_name); + err = changes_pkt_data_freplace__load(freplace); + print_verifier_log(log); + if (expect_load) { + ASSERT_OK(err, "changes_pkt_data_freplace__load"); + } else { + ASSERT_ERR(err, "changes_pkt_data_freplace__load"); + 
ASSERT_HAS_SUBSTR(log, "Extension program changes packet data", "error log"); + } + +out: + changes_pkt_data_freplace__destroy(freplace); + changes_pkt_data__destroy(main); +} + +/* There are two global subprograms in both changes_pkt_data.skel.h: + * - one changes packet data; + * - another does not. + * It is ok to freplace subprograms that change packet data with those + * that either do or do not. It is only ok to freplace subprograms + * that do not change packet data with those that do not as well. + * The below tests check outcomes for each combination of such freplace. + */ +void test_changes_pkt_data_freplace(void) +{ + if (test__start_subtest("changes_with_changes")) + test_aux("changes_pkt_data", "changes_pkt_data", true); + if (test__start_subtest("changes_with_doesnt_change")) + test_aux("changes_pkt_data", "does_not_change_pkt_data", true); + if (test__start_subtest("doesnt_change_with_changes")) + test_aux("does_not_change_pkt_data", "changes_pkt_data", false); + if (test__start_subtest("doesnt_change_with_doesnt_change")) + test_aux("does_not_change_pkt_data", "does_not_change_pkt_data", true); +} diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c new file mode 100644 index 000000000000..f87da8e9d6b3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +__noinline +long changes_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +__noinline __weak +long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return 0; +} + +SEC("tc") +int dummy(struct __sk_buff *sk) +{ + changes_pkt_data(sk, 0); + does_not_change_pkt_data(sk, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c new file mode 100644 index 000000000000..0e525beb8603 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +SEC("?freplace") +long changes_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return bpf_skb_pull_data(sk, len); +} + +SEC("?freplace") +long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1a4607ffba35bf2a630aab299e34dd3f6e658d70 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:59 -0800 Subject: bpf: consider that tail calls invalidate packet pointers Tail-called programs could execute any of the helpers that invalidate packet pointers. Hence, conservatively assume that each tail call invalidates packet pointers. Making the change in bpf_helper_changes_pkt_data() automatically makes use of check_cfg() logic that computes 'changes_pkt_data' effect for global sub-programs, such that the following program could be rejected: int tail_call(struct __sk_buff *sk) { bpf_tail_call_static(sk, &jmp_table, 0); return 0; } SEC("tc") int not_safe(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; ... make p valid ... tail_call(sk); *p = 42; /* this is unsafe */ ... } The tc_bpf2bpf.c:subprog_tc() needs change: mark it as a function that can invalidate packet pointers. Otherwise, it can't be freplaced with tailcall_freplace.c:entry_freplace() that does a tail call. 
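For completeness, a program shaped like the rejected example above can be made
verifiable again by re-deriving and re-checking the packet pointers after the
call that may change packet data (an illustrative sketch, not part of this
patch):

    tail_call(sk);
    p = (void *)(long)sk->data;
    if ((void *)(p + 1) > (void *)(long)sk->data_end)
        return TCX_DROP;
    *p = 42; /* safe again: pointers re-validated after the call */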
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/tc_bpf2bpf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c index d1a57f7d09bd..fe6249d99b31 100644 --- a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c @@ -11,6 +11,8 @@ int subprog_tc(struct __sk_buff *skb) __sink(skb); __sink(ret); + /* let verifier know that 'subprog_tc' can change pointers to skb->data */ + bpf_skb_change_proto(skb, 0, 0); return ret; } -- cgit v1.2.3 From d9706b56e13b7916461ca6b4b731e169ed44ed09 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:11:00 -0800 Subject: selftests/bpf: validate that tail call invalidates packet pointers Add a test case with a tail call done from a global sub-program. Such tails calls should be considered as invalidating packet pointers. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_sock.c | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 51826379a1aa..0d5e56dffabb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -50,6 +50,13 @@ struct { __uint(map_flags, BPF_F_NO_PREALLOC); } sk_storage_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + SEC("cgroup/skb") __description("skb->sk: no NULL check") __failure __msg("invalid mem access 'sock_common_or_null'") @@ -1065,4 +1072,25 @@ int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) return TCX_PASS; } +__noinline +int tail_call(struct __sk_buff *sk) +{ + bpf_tail_call_static(sk, &jmp_table, 0); + return 0; +} + +/* Tail calls invalidate packet pointers. */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + tail_call(sk); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b8f614207b0d5e4abd6df8d5cb3cc11f009d1d93 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 9 Dec 2024 09:29:24 -0600 Subject: scx: Fix maximal BPF selftest prog maximal.bpf.c is still dispatching to and consuming from SCX_DSQ_GLOBAL. Let's have it use its own DSQ to avoid any runtime errors. 
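(Our assumption as to the underlying cause, not stated in the patch: recent
kernels only permit consuming user-created DSQs via
scx_bpf_dsq_move_to_local(), so consuming directly from the built-in
SCX_DSQ_GLOBAL triggers an ops error at runtime. Creating a private DSQ with
scx_bpf_create_dsq() in ops.init() and using it for both insertion and
consumption avoids that.)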
Signed-off-by: David Vernet Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4c005fa71810..430f5e13bf55 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -12,6 +12,8 @@ char _license[] SEC("license") = "GPL"; +#define DSQ_ID 0 + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) @@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { - return 0; + return scx_bpf_create_dsq(DSQ_ID, -1); } void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) -- cgit v1.2.3 From 9b496a8bbed9cc292b0dfd796f38ec58b6d0375f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 5 Dec 2024 14:51:01 -0500 Subject: cgroup/cpuset: Prevent leakage of isolated CPUs into sched domains Isolated CPUs are not allowed to be used in a non-isolated partition. The only exception is the top cpuset which is allowed to contain boot time isolated CPUs. Commit ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem") introduces a simplified scheme of including only partition roots in sched domain generation. However, it does not properly account for this exception case. This can result in leakage of isolated CPUs into a sched domain. Fix it by making sure that isolated CPUs are excluded from the top cpuset before generating sched domains. Also update the way the boot time isolated CPUs are handled in test_cpuset_prs.sh to make sure that those isolated CPUs are really isolated instead of just skipping them in the tests. Fixes: ccac8e8de99c ("cgroup/cpuset: Fix remote root partition creation problem") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 33 +++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 03c1bdaed2c3..400a696a0d21 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -86,15 +86,15 @@ echo "" > test/cpuset.cpus # # If isolated CPUs have been reserved at boot time (as shown in -# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8 # that will be used by this script for testing purpose. If not, some of -# the tests may fail incorrectly. 
These isolated CPUs will also be removed -# before being compared with the expected results. +# the tests may fail incorrectly. These pre-isolated CPUs should stay in +# an isolated state throughout the testing process for now. # BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) if [[ -n "$BOOT_ISOLCPUS" ]] then - [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] && skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" fi @@ -683,15 +683,19 @@ check_isolcpus() EXPECT_VAL2=$EXPECT_VAL fi + # + # Appending pre-isolated CPUs + # Even though CPU #8 isn't used for testing, it can't be pre-isolated + # to make appending those CPUs easier. + # + [[ -n "$BOOT_ISOLCPUS" ]] && { + EXPECT_VAL=${EXPECT_VAL:+${EXPECT_VAL},}${BOOT_ISOLCPUS} + EXPECT_VAL2=${EXPECT_VAL2:+${EXPECT_VAL2},}${BOOT_ISOLCPUS} + } + # # Check cpuset.cpus.isolated cpumask # - if [[ -z "$BOOT_ISOLCPUS" ]] - then - ISOLCPUS=$(cat $ISCPUS) - else - ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") - fi [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { # Take a 50ms pause and try again pause 0.05 @@ -731,8 +735,6 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU - [[ -n "BOOT_ISOLCPUS" ]] && - ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -836,8 +838,11 @@ run_state_test() # if available [[ -n "$ICPUS" ]] && { check_isolcpus $ICPUS - [[ $? -ne 0 ]] && test_fail $I "isolated CPU" \ - "Expect $ICPUS, get $ISOLCPUS instead" + [[ $? -ne 0 ]] && { + [[ -n "$BOOT_ISOLCPUS" ]] && ICPUS=${ICPUS},${BOOT_ISOLCPUS} + test_fail $I "isolated CPU" \ + "Expect $ICPUS, get $ISOLCPUS instead" + } } reset_cgroup_states # -- cgit v1.2.3 From a3b16198d3df38aa2fc6de167b919ecb3fae74a6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 11 Dec 2024 01:35:40 +0200 Subject: selftests: forwarding: add a pvid_change test to bridge_vlan_unaware Historically, DSA drivers have seen problems with the model in which bridge VLANs work, particularly with them being offloaded to switchdev asynchronously relative to when they become active (vlan_filtering=1). This switchdev API peculiarity was papered over by commit 2ea7a679ca2a ("net: dsa: Don't add vlans when vlan filtering is disabled"), which introduced other problems, fixed by commit 54a0ed0df496 ("net: dsa: provide an option for drivers to always receive bridge VLANs") through an opt-in ds->configure_vlan_while_not_filtering bool (which later became an opt-out). The point is that some DSA drivers still skip VLAN configuration while VLAN-unaware, and there is a desire to get rid of that behavior. It's hard to deduce from the wording "at least one corner case" what Andrew saw, but my best guess is that there is a discrepancy of meaning between bridge pvid and hardware port pvid which caused breakage. On one side, the Linux bridge with vlan_filtering=0 is completely VLAN-unaware, and will accept and process a packet the same way irrespective of the VLAN groups on the ports or the bridge itself (there may not even be a pvid, and this makes no difference). On the other hand, DSA switches still do VLAN processing internally, even with vlan_filtering disabled, but they are expected to classify all packets to the port pvid. That pvid shouldn't be confused with the bridge pvid, and there lies the problem. 
When a switch port is under a VLAN-unaware bridge, the hardware pvid must be explicitly managed by the driver to classify all received packets to it, regardless of bridge VLAN groups. When under a VLAN-aware bridge, the hardware pvid must be synchronized to the bridge port pvid. To do this correctly, the pattern is unfortunately a bit complicated, and involves hooking the pvid change logic into quite a few places (the ones that change the input variables which determine the value to use as hardware pvid for a port). See mv88e6xxx_port_commit_pvid(), sja1105_commit_pvid(), ocelot_port_set_pvid() etc. The point is that not all drivers used to do that, especially in older kernels. If a driver is to blindly program a bridge pvid VLAN received from switchdev while it's VLAN-unaware, this might in turn change the hardware pvid used by a VLAN-unaware bridge port, which might result in packet loss depending which other ports have that pvid too (in that same note, it might also go unnoticed). To capture that condition, it is sufficient to take a VLAN-unaware bridge and change the [VLAN-aware] bridge pvid on a single port, to a VID that isn't present on any other port. This shouldn't have absolutely any effect on packet classification or forwarding. However, broken drivers will take the bait, and change their PVID to 3, causing packet loss. Signed-off-by: Vladimir Oltean Tested-by: Ido Schimmel Link: https://patch.msgid.link/20241210233541.1401837-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- .../net/forwarding/bridge_vlan_unaware.sh | 25 +++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh index 1c8a26046589..2b5700b61ffa 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding pvid_change" NUM_NETIFS=4 source lib.sh @@ -77,12 +77,16 @@ cleanup() ping_ipv4() { - ping_test $h1 192.0.2.2 + local msg=$1 + + ping_test $h1 192.0.2.2 "$msg" } ping_ipv6() { - ping6_test $h1 2001:db8:1::2 + local msg=$1 + + ping6_test $h1 2001:db8:1::2 "$msg" } learning() @@ -95,6 +99,21 @@ flooding() flood_test $swp2 $h1 $h2 } +pvid_change() +{ + # Test that the changing of the VLAN-aware PVID does not affect + # VLAN-unaware forwarding + bridge vlan add vid 3 dev $swp1 pvid untagged + + ping_ipv4 " with bridge port $swp1 PVID changed" + ping_ipv6 " with bridge port $swp1 PVID changed" + + bridge vlan del vid 3 dev $swp1 + + ping_ipv4 " with bridge port $swp1 PVID deleted" + ping_ipv6 " with bridge port $swp1 PVID deleted" +} + trap cleanup EXIT setup_prepare -- cgit v1.2.3 From b9fee10a52c0999f6f1c7e1c0ea83869f3cd10ae Mon Sep 17 00:00:00 2001 From: Simone Magnani Date: Mon, 9 Dec 2024 15:54:39 +0100 Subject: bpftool: Probe for ISA v4 instruction set extension This patch introduces a new probe to check whether the kernel supports instruction set extensions v4. The v4 extension comprises several new instructions: BPF_{SDIV,SMOD} (signed div and mod), BPF_{LD,LDX,ST,STX,MOV} (sign-extended load/store/move), 32-bit BPF_JA (unconditional jump), target-independent BPF_ALU64 BSWAP (byte-swapping 16/32/64). 
These have been introduced in the following commits respectively: * ec0e2da95f72 ("bpf: Support new signed div/mod instructions.") * 1f9a1ea821ff ("bpf: Support new sign-extension load insns") * 8100928c8814 ("bpf: Support new sign-extension mov insns") * 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") * 0845c3db7bf5 ("bpf: Support new unconditional bswap instruction") Support in bpftool for previous ISA extensions was added in commit 0fd800b2456c ("bpftool: Probe for instruction set extensions"). These probes are useful for userspace BPF projects that want to use newer instruction set extensions on newer kernels, to reduce the programs' sizes or their complexity. LLVM provides the mcpu=v4 option since LLVM commit 8f28e8069c4b ("[BPF] support for BPF_ST instruction in codegen") [0]. Signed-off-by: Simone Magnani Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://github.com/llvm/llvm-project/commit/8f28e8069c4ba1110daee8bddc4d5049b6d4646e [0] Link: https://lore.kernel.org/bpf/20241209145439.336362-1-simone.magnani@isovalent.com --- tools/bpf/bpftool/feature.c | 23 +++++++++++++++++++++++ tools/include/linux/filter.h | 10 ++++++++++ 2 files changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 4dbc4fcdf473..24fecdf8e430 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -885,6 +885,28 @@ probe_v3_isa_extension(const char *define_prefix, __u32 ifindex) "V3_ISA_EXTENSION"); } +/* + * Probe for the v4 instruction set extension introduced in commit 1f9a1ea821ff + * ("bpf: Support new sign-extension load insns"). + */ +static void +probe_v4_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[5] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP32_IMM(BPF_JEQ, BPF_REG_0, 1, 1), + BPF_JMP32_A(1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v4_isa_extension", + "ISA extension v4", + "V4_ISA_EXTENSION"); +} + static void section_system_config(enum probe_component target, const char *define_prefix) { @@ -1029,6 +1051,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex) probe_bounded_loops(define_prefix, ifindex); probe_v2_isa_extension(define_prefix, ifindex); probe_v3_isa_extension(define_prefix, ifindex); + probe_v4_isa_extension(define_prefix, ifindex); print_end_section(); } diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 65aa8ce142e5..bcc6df79301a 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -273,6 +273,16 @@ .off = OFF, \ .imm = 0 }) +/* Unconditional jumps, gotol pc + imm32 */ + +#define BPF_JMP32_A(IMM) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ -- cgit v1.2.3 From 04789af756a4a43e72986185f66f148e65b32fed Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Dec 2024 23:07:11 -0800 Subject: selftests/bpf: extend changes_pkt_data with cases w/o subprograms Extend changes_pkt_data tests with test cases freplacing the main program that does not have subprograms. Try four combinations when both main program and replacement do and do not change packet data. 
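For reference, the outcomes encoded in the expect_load logic of the test below
boil down to the following (our summary, derived from the test code rather
than stated explicitly in the patch):

    to-be-replaced program      replacement                 freplace load
    changes packet data         changes packet data         accepted
    changes packet data         does not change             accepted
    does not change             changes packet data         rejected
    does not change             does not change             accepted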
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241212070711.427443-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/changes_pkt_data.c | 55 +++++++++++++++++----- .../testing/selftests/bpf/progs/changes_pkt_data.c | 27 ++++++++--- .../bpf/progs/changes_pkt_data_freplace.c | 6 +-- 3 files changed, 66 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c index c0c7202f6c5c..7526de379081 100644 --- a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c +++ b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c @@ -10,10 +10,14 @@ static void print_verifier_log(const char *log) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); } -static void test_aux(const char *main_prog_name, const char *freplace_prog_name, bool expect_load) +static void test_aux(const char *main_prog_name, + const char *to_be_replaced, + const char *replacement, + bool expect_load) { struct changes_pkt_data_freplace *freplace = NULL; struct bpf_program *freplace_prog = NULL; + struct bpf_program *main_prog = NULL; LIBBPF_OPTS(bpf_object_open_opts, opts); struct changes_pkt_data *main = NULL; char log[16*1024]; @@ -26,6 +30,10 @@ static void test_aux(const char *main_prog_name, const char *freplace_prog_name, main = changes_pkt_data__open_opts(&opts); if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) goto out; + main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); + if (!ASSERT_OK_PTR(main_prog, "main_prog")) + goto out; + bpf_program__set_autoload(main_prog, true); err = changes_pkt_data__load(main); print_verifier_log(log); if (!ASSERT_OK(err, "changes_pkt_data__load")) @@ -33,14 +41,14 @@ static void test_aux(const char *main_prog_name, const char *freplace_prog_name, freplace = changes_pkt_data_freplace__open_opts(&opts); if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) goto out; - freplace_prog = bpf_object__find_program_by_name(freplace->obj, freplace_prog_name); + freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) goto out; bpf_program__set_autoload(freplace_prog, true); bpf_program__set_autoattach(freplace_prog, true); bpf_program__set_attach_target(freplace_prog, - bpf_program__fd(main->progs.dummy), - main_prog_name); + bpf_program__fd(main_prog), + to_be_replaced); err = changes_pkt_data_freplace__load(freplace); print_verifier_log(log); if (expect_load) { @@ -62,15 +70,38 @@ out: * that either do or do not. It is only ok to freplace subprograms * that do not change packet data with those that do not as well. * The below tests check outcomes for each combination of such freplace. + * Also test a case when main subprogram itself is replaced and is a single + * subprogram in a program. 
*/ void test_changes_pkt_data_freplace(void) { - if (test__start_subtest("changes_with_changes")) - test_aux("changes_pkt_data", "changes_pkt_data", true); - if (test__start_subtest("changes_with_doesnt_change")) - test_aux("changes_pkt_data", "does_not_change_pkt_data", true); - if (test__start_subtest("doesnt_change_with_changes")) - test_aux("does_not_change_pkt_data", "changes_pkt_data", false); - if (test__start_subtest("doesnt_change_with_doesnt_change")) - test_aux("does_not_change_pkt_data", "does_not_change_pkt_data", true); + struct { + const char *main; + const char *to_be_replaced; + bool changes; + } mains[] = { + { "main_with_subprogs", "changes_pkt_data", true }, + { "main_with_subprogs", "does_not_change_pkt_data", false }, + { "main_changes", "main_changes", true }, + { "main_does_not_change", "main_does_not_change", false }, + }; + struct { + const char *func; + bool changes; + } replacements[] = { + { "changes_pkt_data", true }, + { "does_not_change_pkt_data", false } + }; + char buf[64]; + + for (int i = 0; i < ARRAY_SIZE(mains); ++i) { + for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { + snprintf(buf, sizeof(buf), "%s_with_%s", + mains[i].to_be_replaced, replacements[j].func); + if (!test__start_subtest(buf)) + continue; + test_aux(mains[i].main, mains[i].to_be_replaced, replacements[j].func, + mains[i].changes || !replacements[j].changes); + } + } } diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c index f87da8e9d6b3..43cada48b28a 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data.c +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data.c @@ -4,22 +4,35 @@ #include __noinline -long changes_pkt_data(struct __sk_buff *sk, __u32 len) +long changes_pkt_data(struct __sk_buff *sk) { - return bpf_skb_pull_data(sk, len); + return bpf_skb_pull_data(sk, 0); } __noinline __weak -long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +long does_not_change_pkt_data(struct __sk_buff *sk) { return 0; } -SEC("tc") -int dummy(struct __sk_buff *sk) +SEC("?tc") +int main_with_subprogs(struct __sk_buff *sk) +{ + changes_pkt_data(sk); + does_not_change_pkt_data(sk); + return 0; +} + +SEC("?tc") +int main_changes(struct __sk_buff *sk) +{ + bpf_skb_pull_data(sk, 0); + return 0; +} + +SEC("?tc") +int main_does_not_change(struct __sk_buff *sk) { - changes_pkt_data(sk, 0); - does_not_change_pkt_data(sk, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c index 0e525beb8603..f9a622705f1b 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c +++ b/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c @@ -4,13 +4,13 @@ #include SEC("?freplace") -long changes_pkt_data(struct __sk_buff *sk, __u32 len) +long changes_pkt_data(struct __sk_buff *sk) { - return bpf_skb_pull_data(sk, len); + return bpf_skb_pull_data(sk, 0); } SEC("?freplace") -long does_not_change_pkt_data(struct __sk_buff *sk, __u32 len) +long does_not_change_pkt_data(struct __sk_buff *sk) { return 0; } -- cgit v1.2.3 From 659b9ba7cb2d7adb64618b87ddfaa528a143766e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 Dec 2024 01:20:49 -0800 Subject: bpf: Check size for BTF-based ctx access of pointer members Robert Morris reported the following program type which passes the verifier in [0]: SEC("struct_ops/bpf_cubic_init") void BPF_PROG(bpf_cubic_init, struct sock *sk) { asm volatile("r2 = 
*(u16*)(r1 + 0)"); // verifier should demand u64 asm volatile("*(u32 *)(r2 +1504) = 0"); // 1280 in some configs } The second line may or may not work, but the first instruction shouldn't pass, as it's a narrow load into the context structure of the struct ops callback. The code falls back to btf_ctx_access to ensure correctness and obtaining the types of pointers. Ensure that the size of the access is correctly checked to be 8 bytes, otherwise the verifier thinks the narrow load obtained a trusted BTF pointer and will permit loads/stores as it sees fit. Perform the check on size after we've verified that the load is for a pointer field, as for scalar values narrow loads are fine. Access to structs passed as arguments to a BPF program are also treated as scalars, therefore no adjustment is needed in their case. Existing verifier selftests are broken by this change, but because they were incorrect. Verifier tests for d_path were performing narrow load into context to obtain path pointer, had this program actually run it would cause a crash. The same holds for verifier_btf_ctx_access tests. [0]: https://lore.kernel.org/bpf/51338.1732985814@localhost Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") Reported-by: Robert Morris Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241212092050.3204165-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c | 4 ++-- tools/testing/selftests/bpf/progs/verifier_d_path.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c index a570e48b917a..bfc3bf18fed4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c @@ -11,7 +11,7 @@ __success __retval(0) __naked void btf_ctx_access_accept(void) { asm volatile (" \ - r2 = *(u32*)(r1 + 8); /* load 2nd argument value (int pointer) */\ + r2 = *(u64 *)(r1 + 8); /* load 2nd argument value (int pointer) */\ r0 = 0; \ exit; \ " ::: __clobber_all); @@ -23,7 +23,7 @@ __success __retval(0) __naked void ctx_access_u32_pointer_accept(void) { asm volatile (" \ - r2 = *(u32*)(r1 + 0); /* load 1nd argument value (u32 pointer) */\ + r2 = *(u64 *)(r1 + 0); /* load 1nd argument value (u32 pointer) */\ r0 = 0; \ exit; \ " ::: __clobber_all); diff --git a/tools/testing/selftests/bpf/progs/verifier_d_path.c b/tools/testing/selftests/bpf/progs/verifier_d_path.c index ec79cbcfde91..87e51a215558 100644 --- a/tools/testing/selftests/bpf/progs/verifier_d_path.c +++ b/tools/testing/selftests/bpf/progs/verifier_d_path.c @@ -11,7 +11,7 @@ __success __retval(0) __naked void d_path_accept(void) { asm volatile (" \ - r1 = *(u32*)(r1 + 0); \ + r1 = *(u64 *)(r1 + 0); \ r2 = r10; \ r2 += -8; \ r6 = 0; \ @@ -31,7 +31,7 @@ __failure __msg("helper call is not allowed in probe") __naked void d_path_reject(void) { asm volatile (" \ - r1 = *(u32*)(r1 + 0); \ + r1 = *(u64 *)(r1 + 0); \ r2 = r10; \ r2 += -8; \ r6 = 0; \ -- cgit v1.2.3 From 8025731c28beb4700dc801a1ca4504d1f78bac27 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 Dec 2024 01:20:50 -0800 Subject: selftests/bpf: Add test for narrow ctx load for pointer args Ensure that performing narrow ctx loads other than size == 8 are rejected when the argument is a pointer type. 
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241212092050.3204165-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_btf_ctx_access.c | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c index bfc3bf18fed4..28b939572cda 100644 --- a/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_btf_ctx_access.c @@ -29,4 +29,40 @@ __naked void ctx_access_u32_pointer_accept(void) " ::: __clobber_all); } +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u32") +__failure __msg("size 4 must be 8") +__naked void ctx_access_u32_pointer_reject_32(void) +{ + asm volatile (" \ + r2 = *(u32 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u16") +__failure __msg("size 2 must be 8") +__naked void ctx_access_u32_pointer_reject_16(void) +{ + asm volatile (" \ + r2 = *(u16 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + +SEC("fentry/bpf_fentry_test9") +__description("btf_ctx_access u32 pointer reject u8") +__failure __msg("size 1 must be 8") +__naked void ctx_access_u32_pointer_reject_8(void) +{ + asm volatile (" \ + r2 = *(u8 *)(r1 + 0); /* load 1st argument with narrow load */\ + r0 = 0; \ + exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b641712925bfe89ff7217cc2d0b7a8e042df556b Mon Sep 17 00:00:00 2001 From: Alastair Robertson Date: Wed, 11 Dec 2024 08:40:29 -0800 Subject: libbpf: Pull file-opening logic up to top-level functions Move the filename arguments and file-descriptor handling from init_output_elf() and linker_load_obj_file() and instead handle them at the top-level in bpf_linker__new() and bpf_linker__add_file(). This will allow the inner functions to be shared with a new, non-filename-based, API in the next commit. Signed-off-by: Alastair Robertson Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241211164030.573042-2-ajor@meta.com --- tools/lib/bpf/linker.c | 84 +++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index e56ba6e67451..c49e94506d9c 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -157,10 +157,9 @@ struct bpf_linker { #define pr_warn_elf(fmt, ...) 
\ libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) -static int init_output_elf(struct bpf_linker *linker, const char *file); +static int init_output_elf(struct bpf_linker *linker); -static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, - const struct bpf_linker_file_opts *opts, +static int linker_load_obj_file(struct bpf_linker *linker, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); @@ -233,9 +232,20 @@ struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts if (!linker) return errno = ENOMEM, NULL; - linker->fd = -1; + linker->filename = strdup(filename); + if (!linker->filename) { + err = -ENOMEM; + goto err_out; + } + + linker->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", filename, err); + goto err_out; + } - err = init_output_elf(linker, filename); + err = init_output_elf(linker); if (err) goto err_out; @@ -294,23 +304,12 @@ static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) return sym; } -static int init_output_elf(struct bpf_linker *linker, const char *file) +static int init_output_elf(struct bpf_linker *linker) { int err, str_off; Elf64_Sym *init_sym; struct dst_sec *sec; - linker->filename = strdup(file); - if (!linker->filename) - return -ENOMEM; - - linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); - if (linker->fd < 0) { - err = -errno; - pr_warn("failed to create '%s': %s\n", file, errstr(err)); - return err; - } - linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); if (!linker->elf) { pr_warn_elf("failed to create ELF object"); @@ -440,7 +439,7 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, const struct bpf_linker_file_opts *opts) { struct src_obj obj = {}; - int err = 0; + int err = 0, fd; if (!OPTS_VALID(opts, bpf_linker_file_opts)) return libbpf_err(-EINVAL); @@ -448,7 +447,17 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, if (!linker->elf) return libbpf_err(-EINVAL); - err = err ?: linker_load_obj_file(linker, filename, opts, &obj); + fd = open(filename, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %s\n", filename, errstr(err)); + return libbpf_err(err); + } + + obj.filename = filename; + obj.fd = fd; + + err = err ?: linker_load_obj_file(linker, &obj); err = err ?: linker_append_sec_data(linker, &obj); err = err ?: linker_append_elf_syms(linker, &obj); err = err ?: linker_append_elf_relos(linker, &obj); @@ -534,8 +543,7 @@ static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) return sec; } -static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, - const struct bpf_linker_file_opts *opts, +static int linker_load_obj_file(struct bpf_linker *linker, struct src_obj *obj) { int err = 0; @@ -554,26 +562,18 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, #error "Unknown __BYTE_ORDER__" #endif - pr_debug("linker: adding object file '%s'...\n", filename); - - obj->filename = filename; + pr_debug("linker: adding object file '%s'...\n", obj->filename); - obj->fd = open(filename, O_RDONLY | O_CLOEXEC); - if (obj->fd < 0) { - err = -errno; - pr_warn("failed to open file '%s': %s\n", filename, errstr(err)); - return err; - } obj->elf = 
elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); if (!obj->elf) { - pr_warn_elf("failed to parse ELF file '%s'", filename); + pr_warn_elf("failed to parse ELF file '%s'", obj->filename); return -EINVAL; } /* Sanity check ELF file high-level properties */ ehdr = elf64_getehdr(obj->elf); if (!ehdr) { - pr_warn_elf("failed to get ELF header for %s", filename); + pr_warn_elf("failed to get ELF header for %s", obj->filename); return -EINVAL; } @@ -581,7 +581,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, obj_byteorder = ehdr->e_ident[EI_DATA]; if (obj_byteorder != ELFDATA2LSB && obj_byteorder != ELFDATA2MSB) { err = -EOPNOTSUPP; - pr_warn("unknown byte order of ELF file %s\n", filename); + pr_warn("unknown byte order of ELF file %s\n", obj->filename); return err; } if (link_byteorder == ELFDATANONE) { @@ -591,7 +591,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, obj_byteorder == ELFDATA2MSB ? "big" : "little"); } else if (link_byteorder != obj_byteorder) { err = -EOPNOTSUPP; - pr_warn("byte order mismatch with ELF file %s\n", filename); + pr_warn("byte order mismatch with ELF file %s\n", obj->filename); return err; } @@ -599,12 +599,12 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, || ehdr->e_machine != EM_BPF || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { err = -EOPNOTSUPP; - pr_warn_elf("unsupported kind of ELF file %s", filename); + pr_warn_elf("unsupported kind of ELF file %s", obj->filename); return err; } if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { - pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + pr_warn_elf("failed to get SHSTRTAB section index for %s", obj->filename); return -EINVAL; } @@ -616,21 +616,21 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, shdr = elf64_getshdr(scn); if (!shdr) { pr_warn_elf("failed to get section #%zu header for %s", - sec_idx, filename); + sec_idx, obj->filename); return -EINVAL; } sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); if (!sec_name) { pr_warn_elf("failed to get section #%zu name for %s", - sec_idx, filename); + sec_idx, obj->filename); return -EINVAL; } data = elf_getdata(scn, 0); if (!data) { pr_warn_elf("failed to get section #%zu (%s) data from %s", - sec_idx, sec_name, filename); + sec_idx, sec_name, obj->filename); return -EINVAL; } @@ -666,7 +666,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, err = libbpf_get_error(obj->btf); if (err) { pr_warn("failed to parse .BTF from %s: %s\n", - filename, errstr(err)); + obj->filename, errstr(err)); return err; } sec->skipped = true; @@ -677,7 +677,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, err = libbpf_get_error(obj->btf_ext); if (err) { pr_warn("failed to parse .BTF.ext from '%s': %s\n", - filename, errstr(err)); + obj->filename, errstr(err)); return err; } sec->skipped = true; @@ -694,7 +694,7 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, break; default: pr_warn("unrecognized section #%zu (%s) in %s\n", - sec_idx, sec_name, filename); + sec_idx, sec_name, obj->filename); err = -EINVAL; return err; } -- cgit v1.2.3 From 6d5e5e5d7ce134a0b334c3bfe44a9326d8c5f32b Mon Sep 17 00:00:00 2001 From: Alastair Robertson Date: Wed, 11 Dec 2024 08:40:30 -0800 Subject: libbpf: Extend linker API to support in-memory ELF files The new_fd and add_fd functions correspond to the original new and add_file 
functions, but accept an FD instead of a file name. This gives API consumers the option of using anonymous files/memfds to avoid writing ELFs to disk. This new API will be useful for performing linking as part of bpftrace's JIT compilation. The add_buf function is a convenience wrapper that does the work of creating a memfd for the caller. Signed-off-by: Alastair Robertson Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241211164030.573042-3-ajor@meta.com --- tools/lib/bpf/libbpf.h | 5 ++ tools/lib/bpf/libbpf.map | 4 ++ tools/lib/bpf/linker.c | 162 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 150 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index b2ce3a72b11d..d45807103565 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1796,9 +1796,14 @@ struct bpf_linker_file_opts { struct bpf_linker; LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API struct bpf_linker *bpf_linker__new_fd(int fd, struct bpf_linker_opts *opts); LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, const struct bpf_linker_file_opts *opts); +LIBBPF_API int bpf_linker__add_fd(struct bpf_linker *linker, int fd, + const struct bpf_linker_file_opts *opts); +LIBBPF_API int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz, + const struct bpf_linker_file_opts *opts); LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 54b6f312cfa8..a8b2936a1646 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -432,4 +432,8 @@ LIBBPF_1.5.0 { } LIBBPF_1.4.0; LIBBPF_1.6.0 { + global: + bpf_linker__add_buf; + bpf_linker__add_fd; + bpf_linker__new_fd; } LIBBPF_1.5.0; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index c49e94506d9c..b52f71c59616 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -4,6 +4,10 @@ * * Copyright (c) 2021 Facebook */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include #include #include @@ -16,6 +20,7 @@ #include #include #include +#include #include "libbpf.h" #include "btf.h" #include "libbpf_internal.h" @@ -152,6 +157,8 @@ struct bpf_linker { /* global (including extern) ELF symbols */ int glob_sym_cnt; struct glob_sym *glob_syms; + + bool fd_is_owned; }; #define pr_warn_elf(fmt, ...) 
\ @@ -159,6 +166,9 @@ struct bpf_linker { static int init_output_elf(struct bpf_linker *linker); +static int bpf_linker_add_file(struct bpf_linker *linker, int fd, + const char *filename); + static int linker_load_obj_file(struct bpf_linker *linker, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); @@ -190,7 +200,7 @@ void bpf_linker__free(struct bpf_linker *linker) if (linker->elf) elf_end(linker->elf); - if (linker->fd >= 0) + if (linker->fd >= 0 && linker->fd_is_owned) close(linker->fd); strset__free(linker->strtab_strs); @@ -244,6 +254,49 @@ struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts pr_warn("failed to create '%s': %d\n", filename, err); goto err_out; } + linker->fd_is_owned = true; + + err = init_output_elf(linker); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return errno = -err, NULL; +} + +struct bpf_linker *bpf_linker__new_fd(int fd, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + char filename[32]; + int err; + + if (fd < 0) + return errno = EINVAL, NULL; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return errno = EINVAL, NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return errno = EINVAL, NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return errno = ENOMEM, NULL; + + snprintf(filename, sizeof(filename), "fd:%d", fd); + linker->filename = strdup(filename); + if (!linker->filename) { + err = -ENOMEM; + goto err_out; + } + + linker->fd = fd; + linker->fd_is_owned = false; err = init_output_elf(linker); if (err) @@ -435,24 +488,11 @@ static int init_output_elf(struct bpf_linker *linker) return 0; } -int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, - const struct bpf_linker_file_opts *opts) +static int bpf_linker_add_file(struct bpf_linker *linker, int fd, + const char *filename) { struct src_obj obj = {}; - int err = 0, fd; - - if (!OPTS_VALID(opts, bpf_linker_file_opts)) - return libbpf_err(-EINVAL); - - if (!linker->elf) - return libbpf_err(-EINVAL); - - fd = open(filename, O_RDONLY | O_CLOEXEC); - if (fd < 0) { - err = -errno; - pr_warn("failed to open file '%s': %s\n", filename, errstr(err)); - return libbpf_err(err); - } + int err = 0; obj.filename = filename; obj.fd = fd; @@ -472,12 +512,91 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, free(obj.sym_map); if (obj.elf) elf_end(obj.elf); - if (obj.fd >= 0) - close(obj.fd); + return err; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts) +{ + int fd, err; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + fd = open(filename, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %s\n", filename, errstr(err)); + return libbpf_err(err); + } + + err = bpf_linker_add_file(linker, fd, filename); + close(fd); return libbpf_err(err); } +int bpf_linker__add_fd(struct bpf_linker *linker, int fd, + const struct bpf_linker_file_opts *opts) +{ + char filename[32]; + int err; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + if (fd < 0) + return libbpf_err(-EINVAL); + + snprintf(filename, sizeof(filename), "fd:%d", fd); + err = bpf_linker_add_file(linker, fd, filename); + return libbpf_err(err); +} + +int 
bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz, + const struct bpf_linker_file_opts *opts) +{ + char filename[32]; + int fd, written, ret; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + snprintf(filename, sizeof(filename), "mem:%p+%zu", buf, buf_sz); + + fd = memfd_create(filename, 0); + if (fd < 0) { + ret = -errno; + pr_warn("failed to create memfd '%s': %s\n", filename, errstr(ret)); + return libbpf_err(ret); + } + + written = 0; + while (written < buf_sz) { + ret = write(fd, buf, buf_sz); + if (ret < 0) { + ret = -errno; + pr_warn("failed to write '%s': %s\n", filename, errstr(ret)); + goto err_out; + } + written += ret; + } + + ret = bpf_linker_add_file(linker, fd, filename); +err_out: + close(fd); + return libbpf_err(ret); +} + static bool is_dwarf_sec_name(const char *name) { /* approximation, but the actual list is too long */ @@ -2687,9 +2806,10 @@ int bpf_linker__finalize(struct bpf_linker *linker) } elf_end(linker->elf); - close(linker->fd); - linker->elf = NULL; + + if (linker->fd_is_owned) + close(linker->fd); linker->fd = -1; return 0; -- cgit v1.2.3 From 8da7bf2cee2735dbd2478cf07672ff0d243ce6ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Dec 2024 16:16:57 -1000 Subject: tools/sched_ext: Receive updates from SCX repo Receive tools/sched_ext updates form https://github.com/sched-ext/scx to sync userspace bits: - scx_bpf_dump_header() added which can be used to print out basic scheduler info on dump. - BPF possible/online CPU iterators added. - CO-RE enums added. The enums are autogenerated from vmlinux.h. Include the generated artifacts in tools/sched_ext to keep the Makefile simpler. - Other misc changes. Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 83 ++++++++++++++++++- tools/sched_ext/include/scx/common.h | 6 ++ tools/sched_ext/include/scx/compat.h | 1 + tools/sched_ext/include/scx/enums.autogen.bpf.h | 105 ++++++++++++++++++++++++ tools/sched_ext/include/scx/enums.autogen.h | 41 +++++++++ tools/sched_ext/include/scx/enums.bpf.h | 12 +++ tools/sched_ext/include/scx/enums.h | 27 ++++++ tools/sched_ext/include/scx/user_exit_info.h | 9 +- tools/sched_ext/scx_central.bpf.c | 2 +- tools/sched_ext/scx_central.c | 1 + tools/sched_ext/scx_flatcg.bpf.c | 2 +- tools/sched_ext/scx_flatcg.c | 1 + tools/sched_ext/scx_qmap.bpf.c | 2 +- tools/sched_ext/scx_qmap.c | 2 + 14 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 tools/sched_ext/include/scx/enums.autogen.bpf.h create mode 100644 tools/sched_ext/include/scx/enums.autogen.h create mode 100644 tools/sched_ext/include/scx/enums.bpf.h create mode 100644 tools/sched_ext/include/scx/enums.h (limited to 'tools') diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 625f5b046776..858ba1f438f6 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -9,7 +9,7 @@ #ifdef LSP #define __bpf__ -#include "../vmlinux/vmlinux.h" +#include "../vmlinux.h" #else #include "vmlinux.h" #endif @@ -24,6 +24,10 @@ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +extern int LINUX_KERNEL_VERSION __kconfig; +extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; +extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; + /* * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can * lead to really confusing misbehaviors. Let's trigger a build failure. 
@@ -98,7 +102,7 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ ___bpf_fill(___param, args); \ - _Pragma("GCC diagnostic pop") \ + _Pragma("GCC diagnostic pop") /* * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments @@ -136,6 +140,20 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ___scx_bpf_bstr_format_checker(fmt, ##args); \ }) +/* + * scx_bpf_dump_header() is a wrapper around scx_bpf_dump that adds a header + * of system information for debugging. + */ +#define scx_bpf_dump_header() \ +({ \ + scx_bpf_dump("kernel: %d.%d.%d %s\ncc: %s\n", \ + LINUX_KERNEL_VERSION >> 16, \ + LINUX_KERNEL_VERSION >> 8 & 0xFF, \ + LINUX_KERNEL_VERSION & 0xFF, \ + CONFIG_LOCALVERSION, \ + CONFIG_CC_VERSION_TEXT); \ +}) + #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ BPF_PROG(name, ##args) @@ -317,6 +335,66 @@ u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym; u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; +int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) __ksym; +int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym; +void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym; + +#define def_iter_struct(name) \ +struct bpf_iter_##name { \ + struct bpf_iter_bits it; \ + const struct cpumask *bitmap; \ +}; + +#define def_iter_new(name) \ +static inline int bpf_iter_##name##_new( \ + struct bpf_iter_##name *it, const u64 *unsafe_ptr__ign, u32 nr_words) \ +{ \ + it->bitmap = scx_bpf_get_##name##_cpumask(); \ + return bpf_iter_bits_new(&it->it, (const u64 *)it->bitmap, \ + sizeof(struct cpumask) / 8); \ +} + +#define def_iter_next(name) \ +static inline int *bpf_iter_##name##_next(struct bpf_iter_##name *it) { \ + return bpf_iter_bits_next(&it->it); \ +} + +#define def_iter_destroy(name) \ +static inline void bpf_iter_##name##_destroy(struct bpf_iter_##name *it) { \ + scx_bpf_put_cpumask(it->bitmap); \ + bpf_iter_bits_destroy(&it->it); \ +} +#define def_for_each_cpu(cpu, name) for_each_##name##_cpu(cpu) + +/// Provides iterator for possible and online cpus. +/// +/// # Example +/// +/// ``` +/// static inline void example_use() { +/// int *cpu; +/// +/// for_each_possible_cpu(cpu){ +/// bpf_printk("CPU %d is possible", *cpu); +/// } +/// +/// for_each_online_cpu(cpu){ +/// bpf_printk("CPU %d is online", *cpu); +/// } +/// } +/// ``` +def_iter_struct(possible); +def_iter_new(possible); +def_iter_next(possible); +def_iter_destroy(possible); +#define for_each_possible_cpu(cpu) bpf_for_each(possible, cpu, NULL, 0) + +def_iter_struct(online); +def_iter_new(online); +def_iter_next(online); +def_iter_destroy(online); +#define for_each_online_cpu(cpu) bpf_for_each(online, cpu, NULL, 0) + /* * Access a cpumask in read-only mode (typically to check bits). 
*/ @@ -423,5 +501,6 @@ static inline u32 log2_u64(u64 v) } #include "compat.bpf.h" +#include "enums.bpf.h" #endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 5b0f90152152..dc18b99e55cd 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -71,5 +71,11 @@ typedef int64_t s64; #include "user_exit_info.h" #include "compat.h" +#include "enums.h" + +/* not available when building kernel tools/sched_ext */ +#if __has_include() +#include +#endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index cc56ff9aa252..b50280e2ba2b 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -149,6 +149,7 @@ static inline long scx_hotplug_seq(void) __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ + SCX_ENUM_INIT(__skel); \ __skel; \ }) diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h new file mode 100644 index 000000000000..0e941a0d6f88 --- /dev/null +++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h @@ -0,0 +1,105 @@ +/* + * WARNING: This file is autogenerated from scripts/gen_enums.py. If you would + * like to access an enum that is currently missing, add it to the script + * and run it from the root directory to update this file. + */ + +const volatile u64 __SCX_OPS_NAME_LEN __weak; +#define SCX_OPS_NAME_LEN __SCX_OPS_NAME_LEN + +const volatile u64 __SCX_SLICE_DFL __weak; +#define SCX_SLICE_DFL __SCX_SLICE_DFL + +const volatile u64 __SCX_SLICE_INF __weak; +#define SCX_SLICE_INF __SCX_SLICE_INF + +const volatile u64 __SCX_DSQ_FLAG_BUILTIN __weak; +#define SCX_DSQ_FLAG_BUILTIN __SCX_DSQ_FLAG_BUILTIN + +const volatile u64 __SCX_DSQ_FLAG_LOCAL_ON __weak; +#define SCX_DSQ_FLAG_LOCAL_ON __SCX_DSQ_FLAG_LOCAL_ON + +const volatile u64 __SCX_DSQ_INVALID __weak; +#define SCX_DSQ_INVALID __SCX_DSQ_INVALID + +const volatile u64 __SCX_DSQ_GLOBAL __weak; +#define SCX_DSQ_GLOBAL __SCX_DSQ_GLOBAL + +const volatile u64 __SCX_DSQ_LOCAL __weak; +#define SCX_DSQ_LOCAL __SCX_DSQ_LOCAL + +const volatile u64 __SCX_DSQ_LOCAL_ON __weak; +#define SCX_DSQ_LOCAL_ON __SCX_DSQ_LOCAL_ON + +const volatile u64 __SCX_DSQ_LOCAL_CPU_MASK __weak; +#define SCX_DSQ_LOCAL_CPU_MASK __SCX_DSQ_LOCAL_CPU_MASK + +const volatile u64 __SCX_TASK_QUEUED __weak; +#define SCX_TASK_QUEUED __SCX_TASK_QUEUED + +const volatile u64 __SCX_TASK_RESET_RUNNABLE_AT __weak; +#define SCX_TASK_RESET_RUNNABLE_AT __SCX_TASK_RESET_RUNNABLE_AT + +const volatile u64 __SCX_TASK_DEQD_FOR_SLEEP __weak; +#define SCX_TASK_DEQD_FOR_SLEEP __SCX_TASK_DEQD_FOR_SLEEP + +const volatile u64 __SCX_TASK_STATE_SHIFT __weak; +#define SCX_TASK_STATE_SHIFT __SCX_TASK_STATE_SHIFT + +const volatile u64 __SCX_TASK_STATE_BITS __weak; +#define SCX_TASK_STATE_BITS __SCX_TASK_STATE_BITS + +const volatile u64 __SCX_TASK_STATE_MASK __weak; +#define SCX_TASK_STATE_MASK __SCX_TASK_STATE_MASK + +const volatile u64 __SCX_TASK_CURSOR __weak; +#define SCX_TASK_CURSOR __SCX_TASK_CURSOR + +const volatile u64 __SCX_TASK_NONE __weak; +#define SCX_TASK_NONE __SCX_TASK_NONE + +const volatile u64 __SCX_TASK_INIT __weak; +#define SCX_TASK_INIT __SCX_TASK_INIT + +const volatile u64 __SCX_TASK_READY __weak; +#define SCX_TASK_READY __SCX_TASK_READY + +const volatile u64 __SCX_TASK_ENABLED __weak; +#define SCX_TASK_ENABLED 
__SCX_TASK_ENABLED + +const volatile u64 __SCX_TASK_NR_STATES __weak; +#define SCX_TASK_NR_STATES __SCX_TASK_NR_STATES + +const volatile u64 __SCX_TASK_DSQ_ON_PRIQ __weak; +#define SCX_TASK_DSQ_ON_PRIQ __SCX_TASK_DSQ_ON_PRIQ + +const volatile u64 __SCX_KICK_IDLE __weak; +#define SCX_KICK_IDLE __SCX_KICK_IDLE + +const volatile u64 __SCX_KICK_PREEMPT __weak; +#define SCX_KICK_PREEMPT __SCX_KICK_PREEMPT + +const volatile u64 __SCX_KICK_WAIT __weak; +#define SCX_KICK_WAIT __SCX_KICK_WAIT + +const volatile u64 __SCX_ENQ_WAKEUP __weak; +#define SCX_ENQ_WAKEUP __SCX_ENQ_WAKEUP + +const volatile u64 __SCX_ENQ_HEAD __weak; +#define SCX_ENQ_HEAD __SCX_ENQ_HEAD + +const volatile u64 __SCX_ENQ_PREEMPT __weak; +#define SCX_ENQ_PREEMPT __SCX_ENQ_PREEMPT + +const volatile u64 __SCX_ENQ_REENQ __weak; +#define SCX_ENQ_REENQ __SCX_ENQ_REENQ + +const volatile u64 __SCX_ENQ_LAST __weak; +#define SCX_ENQ_LAST __SCX_ENQ_LAST + +const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak; +#define SCX_ENQ_CLEAR_OPSS __SCX_ENQ_CLEAR_OPSS + +const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak; +#define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ + diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h new file mode 100644 index 000000000000..88137a140e72 --- /dev/null +++ b/tools/sched_ext/include/scx/enums.autogen.h @@ -0,0 +1,41 @@ +/* + * WARNING: This file is autogenerated from scripts/gen_enums.py. If you would + * like to access an enum that is currently missing, add it to the script + * and run it from the root directory to update this file. + */ + +#define SCX_ENUM_INIT(skel) do { \ + SCX_ENUM_SET(skel, scx_public_consts, SCX_OPS_NAME_LEN); \ + SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_DFL); \ + SCX_ENUM_SET(skel, scx_public_consts, SCX_SLICE_INF); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_BUILTIN); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_FLAG_LOCAL_ON); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_INVALID); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_GLOBAL); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_LOCAL); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_LOCAL_ON); \ + SCX_ENUM_SET(skel, scx_dsq_id_flags, SCX_DSQ_LOCAL_CPU_MASK); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_QUEUED); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_RESET_RUNNABLE_AT); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_DEQD_FOR_SLEEP); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_SHIFT); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_BITS); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_MASK); \ + SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_CURSOR); \ + SCX_ENUM_SET(skel, scx_task_state, SCX_TASK_NONE); \ + SCX_ENUM_SET(skel, scx_task_state, SCX_TASK_INIT); \ + SCX_ENUM_SET(skel, scx_task_state, SCX_TASK_READY); \ + SCX_ENUM_SET(skel, scx_task_state, SCX_TASK_ENABLED); \ + SCX_ENUM_SET(skel, scx_task_state, SCX_TASK_NR_STATES); \ + SCX_ENUM_SET(skel, scx_ent_dsq_flags, SCX_TASK_DSQ_ON_PRIQ); \ + SCX_ENUM_SET(skel, scx_kick_flags, SCX_KICK_IDLE); \ + SCX_ENUM_SET(skel, scx_kick_flags, SCX_KICK_PREEMPT); \ + SCX_ENUM_SET(skel, scx_kick_flags, SCX_KICK_WAIT); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_WAKEUP); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_HEAD); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_PREEMPT); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_REENQ); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \ + SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \ +} 
while (0) diff --git a/tools/sched_ext/include/scx/enums.bpf.h b/tools/sched_ext/include/scx/enums.bpf.h new file mode 100644 index 000000000000..af704c5d6334 --- /dev/null +++ b/tools/sched_ext/include/scx/enums.bpf.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Convenience macros for getting/setting struct scx_enums instances. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ +#ifndef __SCX_ENUMS_BPF_H +#define __SCX_ENUMS_BPF_H + +#include "enums.autogen.bpf.h" + +#endif /* __SCX_ENUMS_BPF_H */ diff --git a/tools/sched_ext/include/scx/enums.h b/tools/sched_ext/include/scx/enums.h new file mode 100644 index 000000000000..34cbebe974b7 --- /dev/null +++ b/tools/sched_ext/include/scx/enums.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct scx_enums that stores the load-time values of enums + * used by the BPF program. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#ifndef __SCX_ENUMS_H +#define __SCX_ENUMS_H + +static inline void __ENUM_set(u64 *val, char *type, char *name) +{ + bool res; + + res = __COMPAT_read_enum(type, name, val); + SCX_BUG_ON(!res, "enum not found(%s)", name); +} + +#define SCX_ENUM_SET(skel, type, name) do { \ + __ENUM_set(&skel->rodata->__##name, #type, #name); \ + } while (0) + + +#include "enums.autogen.h" + +#endif /* __SCX_ENUMS_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 8ce2734402e1..66f856640ee7 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,6 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H +#ifdef LSP +#define __bpf__ +#include "../vmlinux.h" +#endif + enum uei_sizes { UEI_REASON_LEN = 128, UEI_MSG_LEN = 1024, @@ -25,9 +30,7 @@ struct user_exit_info { #ifdef __bpf__ -#ifdef LSP -#include "../vmlinux/vmlinux.h" -#else +#ifndef LSP #include "vmlinux.h" #endif #include diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index e6fad6211f6c..2907df78241e 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -57,7 +57,7 @@ enum { const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u64 slice_ns; bool timer_pinned = true; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index e938156ed0a0..1e9f74525d8f 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -58,6 +58,7 @@ restart: skel->rodata->central_cpu = 0; skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { switch (opt) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 4e3afcd260bf..3dbfa82883be 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -57,7 +57,7 @@ enum { char _license[] SEC("license") = "GPL"; const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile u64 cgrp_slice_ns; const volatile bool fifo_sched; u64 cvtime_now; diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 5d24ca9c29d9..6dd423eeb4ff 100644 --- a/tools/sched_ext/scx_flatcg.c +++ 
b/tools/sched_ext/scx_flatcg.c @@ -137,6 +137,7 @@ restart: skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { double v; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index ee264947e0c3..3a20bb0c014a 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -33,7 +33,7 @@ enum consts { char _license[] SEC("license") = "GPL"; -const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u64 slice_ns; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index ac45a02b4055..c4912ab2e76f 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -64,6 +64,8 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); + skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { switch (opt) { case 's': -- cgit v1.2.3 From 5506b7d7bbdb7622959d80a4a2fc18985a01d512 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Dec 2024 16:32:24 -0800 Subject: selftests/bpf: make BPF_TARGET_ENDIAN non-recursive to speed up *.bpf.o build BPF_TARGET_ENDIAN is used in CLANG_BPF_BUILD_RULE and co macros. It is defined as a recursively expanded variable, meaning that it is recomputed each time the value is needed. Thus, it is recomputed for each *.bpf.o file compilation. The variable is computed by running a C compiler in a shell. This significantly hinders parallel build performance for *.bpf.o files. This commit changes BPF_TARGET_ENDIAN to be a simply expanded variable. # Build performance stats before this commit $ git clean -xfd; time make -j12 real 1m0.000s ... # Build performance stats after this commit $ git clean -xfd; time make -j12 real 0m43.605s ... Signed-off-by: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20241213003224.837030-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bb8cf8f5bf11..9e870e519c30 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -461,10 +461,10 @@ $(shell $(1) $(2) -dM -E - Date: Fri, 29 Nov 2024 15:47:49 +0100 Subject: objtool/x86: allow syscall instruction The syscall instruction is used in Xen PV mode for doing hypercalls. Allow syscall to be used in the kernel in case it is tagged with an unwind hint for objtool. This is part of XSA-466 / CVE-2024-53241. 
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- tools/objtool/check.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411f..76060da755b5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3820,9 +3820,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_CONTEXT_SWITCH: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; + if (func) { + if (!next_insn || !next_insn->hint) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + break; } return 0; -- cgit v1.2.3 From ce03573a1917532da06057da9f8e74a2ee9e2ac9 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Wed, 11 Dec 2024 19:16:39 +0800 Subject: kselftest/arm64: abi: fix SVCR detection When using svcr_in to check ZA and Streaming Mode, we should make sure that the value in x2 is correct, otherwise it may trigger an Illegal instruction if FEAT_SVE and !FEAT_SME. Fixes: 43e3f85523e4 ("kselftest/arm64: Add SME support to syscall ABI test") Signed-off-by: Weizhao Ouyang Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241211111639.12344-1-o451686892@gmail.com Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/abi/syscall-abi-asm.S | 32 ++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S index df3230fdac39..66ab2e0bae5f 100644 --- a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S +++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S @@ -81,32 +81,31 @@ do_syscall: stp x27, x28, [sp, #96] // Set SVCR if we're doing SME - cbz x1, 1f + cbz x1, load_gpr adrp x2, svcr_in ldr x2, [x2, :lo12:svcr_in] msr S3_3_C4_C2_2, x2 -1: // Load ZA and ZT0 if enabled - uses x12 as scratch due to SME LDR - tbz x2, #SVCR_ZA_SHIFT, 1f + tbz x2, #SVCR_ZA_SHIFT, load_gpr mov w12, #0 ldr x2, =za_in -2: _ldr_za 12, 2 +1: _ldr_za 12, 2 add x2, x2, x1 add x12, x12, #1 cmp x1, x12 - bne 2b + bne 1b // ZT0 mrs x2, S3_0_C0_C4_5 // ID_AA64SMFR0_EL1 ubfx x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \ #ID_AA64SMFR0_EL1_SMEver_WIDTH - cbz x2, 1f + cbz x2, load_gpr adrp x2, zt_in add x2, x2, :lo12:zt_in _ldr_zt 2 -1: +load_gpr: // Load GPRs x8-x28, and save our SP/FP for later comparison ldr x2, =gpr_in add x2, x2, #64 @@ -125,9 +124,9 @@ do_syscall: str x30, [x2], #8 // LR // Load FPRs if we're not doing neither SVE nor streaming SVE - cbnz x0, 1f + cbnz x0, check_sve_in ldr x2, =svcr_in - tbnz x2, #SVCR_SM_SHIFT, 1f + tbnz x2, #SVCR_SM_SHIFT, check_sve_in ldr x2, =fpr_in ldp q0, q1, [x2] @@ -148,8 +147,8 @@ do_syscall: ldp q30, q31, [x2, #16 * 30] b 2f -1: +check_sve_in: // Load the SVE registers if we're doing SVE/SME ldr x2, =z_in @@ -256,32 +255,31 @@ do_syscall: stp q30, q31, [x2, #16 * 30] // Save SVCR if we're doing SME - cbz x1, 1f + cbz x1, check_sve_out mrs x2, S3_3_C4_C2_2 adrp x3, svcr_out str x2, [x3, :lo12:svcr_out] -1: // Save ZA if it's enabled - uses x12 as scratch due to SME STR - tbz x2, #SVCR_ZA_SHIFT, 1f + tbz x2, #SVCR_ZA_SHIFT, check_sve_out mov w12, #0 ldr x2, =za_out -2: _str_za 12, 2 +1: _str_za 12, 2 add x2, x2, x1 add x12, x12, #1 cmp x1, x12 - bne 2b + bne 1b // ZT0 mrs x2, S3_0_C0_C4_5 // ID_AA64SMFR0_EL1 ubfx x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \ 
#ID_AA64SMFR0_EL1_SMEver_WIDTH - cbz x2, 1f + cbz x2, check_sve_out adrp x2, zt_out add x2, x2, :lo12:zt_out _str_zt 2 -1: +check_sve_out: // Save the SVE state if we have some cbz x0, 1f -- cgit v1.2.3 From 5e3ad22d82238e8bcb4c7ec26a20533217ddfb18 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 13 Dec 2024 12:44:09 -0700 Subject: bpftool: man: Add missing format argument to command description The command description was missing the optional argument. Add it there for consistency with the rest of the commands. Signed-off-by: Daniel Xu Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/140402f22fc377fba4c34376b7e1d2eba2c276b1.1734119028.git.dxu@dxuuu.xyz --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 3f6bca03ad2e..245569f43035 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -43,7 +43,7 @@ bpftool btf { show | list } [id *BTF_ID*] that hold open file descriptors (FDs) against BTF objects. On such kernels bpftool will automatically emit this information as well. -bpftool btf dump *BTF_SRC* +bpftool btf dump *BTF_SRC* [format *FORMAT*] Dump BTF entries from a given *BTF_SRC*. When **id** is specified, BTF object with that ID will be loaded and all -- cgit v1.2.3 From 7f5819e1ace85632cf58c43ab6c38d2d4b0aa161 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 13 Dec 2024 12:44:10 -0700 Subject: bpftool: btf: Validate root_type_ids early Handle invalid root_type_ids early, as an invalid ID will cause dumpers to half-emit valid boilerplate and then bail with an unclean exit. This is ugly and possibly confusing for users, so preemptively handle the common error case before any dumping begins. Signed-off-by: Daniel Xu Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/33e09a08a6072f8381cb976218a009709309b7e1.1734119028.git.dxu@dxuuu.xyz --- tools/bpf/bpftool/btf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index d005e4fd6128..3e995faf9efa 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -886,6 +886,7 @@ static int do_dump(int argc, char **argv) const char *src; int fd = -1; int err = 0; + int i; if (!REQ_ARGS(2)) { usage(); @@ -1017,6 +1018,17 @@ static int do_dump(int argc, char **argv) } } + /* Invalid root IDs causes half emitted boilerplate and then unclean + * exit. It's an ugly user experience, so handle common error here. + */ + for (i = 0; i < root_type_cnt; i++) { + if (root_type_ids[i] >= btf__type_cnt(btf)) { + err = -EINVAL; + p_err("invalid root ID: %u", root_type_ids[i]); + goto done; + } + } + if (dump_c) { if (json_output) { p_err("JSON output for C-syntax dump is not supported"); -- cgit v1.2.3 From a812d92ed2aee2d57dccb12b289377265f4ce5e7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 13 Dec 2024 12:44:11 -0700 Subject: bpftool: btf: Support dumping a specific types from file Some projects, for example xdp-tools [0], prefer to check in a minimized vmlinux.h rather than the complete file which can get rather large. However, when you try to add a minimized version of a complex struct (eg struct xfrm_state), things can get quite complex if you're trying to manually untangle and deduplicate the dependencies. 
This commit teaches bpftool to do a minimized dump of a specific types by providing a optional root_id argument(s). Example usage: $ ./bpftool btf dump file ~/dev/linux/vmlinux | rg "STRUCT 'xfrm_state'" [12643] STRUCT 'xfrm_state' size=912 vlen=58 $ ./bpftool btf dump file ~/dev/linux/vmlinux root_id 12643 format c #ifndef __VMLINUX_H__ #define __VMLINUX_H__ [..] struct xfrm_type_offload; struct xfrm_sec_ctx; struct xfrm_state { possible_net_t xs_net; union { struct hlist_node gclist; struct hlist_node bydst; }; union { struct hlist_node dev_gclist; struct hlist_node bysrc; }; struct hlist_node byspi; [..] [0]: https://github.com/xdp-project/xdp-tools/blob/master/headers/bpf/vmlinux.h Signed-off-by: Daniel Xu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/04feb860c0a56a7da66f923551484e1483a72074.1734119028.git.dxu@dxuuu.xyz --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 9 ++++-- tools/bpf/bpftool/btf.c | 39 +++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 245569f43035..d47dddc2b4ee 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -24,7 +24,7 @@ BTF COMMANDS ============= | **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] [**root_id** *ROOT_ID*] | **bpftool** **btf help** | | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } @@ -43,7 +43,7 @@ bpftool btf { show | list } [id *BTF_ID*] that hold open file descriptors (FDs) against BTF objects. On such kernels bpftool will automatically emit this information as well. -bpftool btf dump *BTF_SRC* [format *FORMAT*] +bpftool btf dump *BTF_SRC* [format *FORMAT*] [root_id *ROOT_ID*] Dump BTF entries from a given *BTF_SRC*. When **id** is specified, BTF object with that ID will be loaded and all @@ -67,6 +67,11 @@ bpftool btf dump *BTF_SRC* [format *FORMAT*] formatting, the output is sorted by default. Use the **unsorted** option to avoid sorting the output. + **root_id** option can be used to filter a dump to a single type and all + its dependent types. It cannot be used with any other types of filtering + (such as the "key", "value", or "kv" arguments when dumping BTF for a map). + It can be passed multiple times to dump multiple types. + bpftool btf help Print short help message. 
diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 3e995faf9efa..2636655ac180 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -27,6 +27,8 @@ #define KFUNC_DECL_TAG "bpf_kfunc" #define FASTCALL_DECL_TAG "bpf_fastcall" +#define MAX_ROOT_IDS 16 + static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", @@ -880,7 +882,8 @@ static int do_dump(int argc, char **argv) { bool dump_c = false, sort_dump_c = true; struct btf *btf = NULL, *base = NULL; - __u32 root_type_ids[2]; + __u32 root_type_ids[MAX_ROOT_IDS]; + bool have_id_filtering; int root_type_cnt = 0; __u32 btf_id = -1; const char *src; @@ -974,6 +977,8 @@ static int do_dump(int argc, char **argv) goto done; } + have_id_filtering = !!root_type_cnt; + while (argc) { if (is_prefix(*argv, "format")) { NEXT_ARG(); @@ -993,6 +998,36 @@ static int do_dump(int argc, char **argv) goto done; } NEXT_ARG(); + } else if (is_prefix(*argv, "root_id")) { + __u32 root_id; + char *end; + + if (have_id_filtering) { + p_err("cannot use root_id with other type filtering"); + err = -EINVAL; + goto done; + } else if (root_type_cnt == MAX_ROOT_IDS) { + p_err("only %d root_id are supported", MAX_ROOT_IDS); + err = -E2BIG; + goto done; + } + + NEXT_ARG(); + root_id = strtoul(*argv, &end, 0); + if (*end) { + err = -1; + p_err("can't parse %s as root ID", *argv); + goto done; + } + for (i = 0; i < root_type_cnt; i++) { + if (root_type_ids[i] == root_id) { + err = -EINVAL; + p_err("duplicate root_id %d supplied", root_id); + goto done; + } + } + root_type_ids[root_type_cnt++] = root_id; + NEXT_ARG(); } else if (is_prefix(*argv, "unsorted")) { sort_dump_c = false; NEXT_ARG(); @@ -1403,7 +1438,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list } [id BTF_ID]\n" - " %1$s %2$s dump BTF_SRC [format FORMAT]\n" + " %1$s %2$s dump BTF_SRC [format FORMAT] [root_id ROOT_ID]\n" " %1$s %2$s help\n" "\n" " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" -- cgit v1.2.3 From 9d294f6986789e20696f44c2deb4c7f7b8ae4704 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 13 Dec 2024 12:44:12 -0700 Subject: bpftool: bash: Add bash completion for root_id argument This commit updates the bash completion script with the new root_id argument. 
Signed-off-by: Daniel Xu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/37016c786620761e621a88240e36f6cb27a8f628.1734119028.git.dxu@dxuuu.xyz --- tools/bpf/bpftool/bash-completion/bpftool | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 0c541498c301..1ce409a6cbd9 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -930,19 +930,24 @@ _bpftool() format) COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) ;; + root_id) + return 0; + ;; c) - COMPREPLY=( $( compgen -W "unsorted" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "unsorted root_id" -- "$cur" ) ) ;; *) # emit extra options case ${words[3]} in id|file) + COMPREPLY=( $( compgen -W "root_id" -- "$cur" ) ) _bpftool_once_attr 'format' ;; map|prog) if [[ ${words[3]} == "map" ]] && [[ $cword == 6 ]]; then COMPREPLY+=( $( compgen -W "key value kv all" -- "$cur" ) ) fi + COMPREPLY=( $( compgen -W "root_id" -- "$cur" ) ) _bpftool_once_attr 'format' ;; *) -- cgit v1.2.3 From 4d3ae294f900fb7232fb6c890dbd3176b8a5f121 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:31 +0000 Subject: bpf: Add fd_array_cnt attribute for prog_load The fd_array attribute of the BPF_PROG_LOAD syscall may contain a set of file descriptors: maps or btfs. This field was introduced as a sparse array. Introduce a new attribute, fd_array_cnt, which, if present, indicates that the fd_array is a continuous array of the corresponding length. If fd_array_cnt is non-zero, then every map in the fd_array will be bound to the program, as if it was used by the program. This functionality is similar to the BPF_PROG_BIND_MAP syscall, but such maps can be used by the verifier during the program load. Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-5-aspsk@isovalent.com --- tools/include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4162afc6b5d0..2acf9b336371 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1573,6 +1573,16 @@ union bpf_attr { * If provided, prog_flags should have BPF_F_TOKEN_FD flag set. */ __s32 prog_token_fd; + /* The fd_array_cnt can be used to pass the length of the + * fd_array array. In this case all the [map] file descriptors + * passed in this array will be bound to the program, even if + * the maps are not referenced directly. The functionality is + * similar to the BPF_PROG_BIND_MAP syscall, but maps can be + * used by the verifier during the program load. If provided, + * then the fd_array[0,...,fd_array_cnt-1] is expected to be + * continuous. + */ + __u32 fd_array_cnt; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From f9933acda31a9882b6e08f58cb976e67842a180b Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:32 +0000 Subject: libbpf: prog load: Allow to use fd_array_cnt Add new fd_array_cnt field to bpf_prog_load_opts and pass it in bpf_attr, if set. 
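
For illustration only (this snippet is not part of the patch), a minimal sketch of a caller passing a dense FD array through the new option; the function and variable names here are hypothetical, and the pattern mirrors the selftest added later in this series:

	#include <bpf/bpf.h>

	static int load_prog_with_fd_array(const struct bpf_insn *insns, size_t insn_cnt,
					   const int *fds, __u32 nr_fds)
	{
		LIBBPF_OPTS(bpf_prog_load_opts, opts);

		/* fds[0..nr_fds-1] is a dense array of map/BTF FDs; with
		 * fd_array_cnt set, every map in it is bound to the program
		 * at load time, even if the program never references it.
		 */
		opts.fd_array = fds;
		opts.fd_array_cnt = nr_fds;

		return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL",
				     insns, insn_cnt, &opts);
	}
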
Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-6-aspsk@isovalent.com --- tools/lib/bpf/bpf.c | 3 ++- tools/lib/bpf/bpf.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index becdfa701c75..359f73ead613 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -238,7 +238,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; @@ -311,6 +311,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0); attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL)); + attr.fd_array_cnt = OPTS_GET(opts, fd_array_cnt, 0); if (log_level) { attr.log_buf = ptr_to_u64(log_buf); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a4a7b1ad1b63..435da95d2058 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -107,9 +107,12 @@ struct bpf_prog_load_opts { */ __u32 log_true_size; __u32 token_fd; + + /* if set, provides the length of fd_array */ + __u32 fd_array_cnt; size_t :0; }; -#define bpf_prog_load_opts__last_field token_fd +#define bpf_prog_load_opts__last_field fd_array_cnt LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, -- cgit v1.2.3 From 1c593d7402b13d97f997b570e9fc7c49e53e1ed1 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:33 +0000 Subject: selftests/bpf: Add tests for fd_array_cnt Add a new set of tests to test the new field in PROG_LOAD-related part of bpf_attr: fd_array_cnt. Add the following test cases: * fd_array_cnt/no-fd-array: program is loaded in a normal way, without any fd_array present * fd_array_cnt/fd-array-ok: pass two extra non-used maps, check that they're bound to the program * fd_array_cnt/fd-array-dup-input: pass a few extra maps, only two of which are unique * fd_array_cnt/fd-array-ref-maps-in-array: pass a map in fd_array which is also referenced from within the program * fd_array_cnt/fd-array-trash-input: pass array with some trash * fd_array_cnt/fd-array-2big: pass too large array All the tests above are using the bpf(2) syscall directly, no libbpf involved. 
Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-7-aspsk@isovalent.com --- tools/testing/selftests/bpf/prog_tests/fd_array.c | 441 ++++++++++++++++++++++ 1 file changed, 441 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fd_array.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c new file mode 100644 index 000000000000..a1d52e73fb16 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include +#include + +#include "../test_btf.h" + +static inline int new_map(void) +{ + const char *name = NULL; + __u32 max_entries = 1; + __u32 value_size = 8; + __u32 key_size = 4; + + return bpf_map_create(BPF_MAP_TYPE_ARRAY, name, + key_size, value_size, + max_entries, NULL); +} + +static int new_btf(void) +{ + struct btf_blob { + struct btf_header btf_hdr; + __u32 types[8]; + __u32 str; + } raw_btf = { + .btf_hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_len = sizeof(raw_btf.types), + .str_off = offsetof(struct btf_blob, str) - offsetof(struct btf_blob, types), + .str_len = sizeof(raw_btf.str), + }, + .types = { + /* long */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8), /* [1] */ + /* unsigned long */ + BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */ + }, + }; + + return bpf_btf_load(&raw_btf, sizeof(raw_btf), NULL); +} + +#define Close(FD) do { \ + if ((FD) >= 0) { \ + close(FD); \ + FD = -1; \ + } \ +} while(0) + +static bool map_exists(__u32 id) +{ + int fd; + + fd = bpf_map_get_fd_by_id(id); + if (fd >= 0) { + close(fd); + return true; + } + return false; +} + +static bool btf_exists(__u32 id) +{ + int fd; + + fd = bpf_btf_get_fd_by_id(id); + if (fd >= 0) { + close(fd); + return true; + } + return false; +} + +static inline int bpf_prog_get_map_ids(int prog_fd, __u32 *nr_map_ids, __u32 *map_ids) +{ + __u32 len = sizeof(struct bpf_prog_info); + struct bpf_prog_info info; + int err; + + memset(&info, 0, len); + info.nr_map_ids = *nr_map_ids, + info.map_ids = ptr_to_u64(map_ids), + + err = bpf_prog_get_info_by_fd(prog_fd, &info, &len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) + return -1; + + *nr_map_ids = info.nr_map_ids; + + return 0; +} + +static int __load_test_prog(int map_fd, const int *fd_array, int fd_array_cnt) +{ + /* A trivial program which uses one map */ + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts); + + opts.fd_array = fd_array; + opts.fd_array_cnt = fd_array_cnt; + + return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, ARRAY_SIZE(insns), &opts); +} + +static int load_test_prog(const int *fd_array, int fd_array_cnt) +{ + int map_fd; + int ret; + + map_fd = new_map(); + if (!ASSERT_GE(map_fd, 0, "new_map")) + return map_fd; + + ret = __load_test_prog(map_fd, fd_array, fd_array_cnt); + close(map_fd); + return ret; +} + +static bool check_expected_map_ids(int prog_fd, int expected, __u32 *map_ids, __u32 *nr_map_ids) +{ + int err; + + err = bpf_prog_get_map_ids(prog_fd, nr_map_ids, map_ids); + if (!ASSERT_OK(err, "bpf_prog_get_map_ids")) + return false; + 
if (!ASSERT_EQ(*nr_map_ids, expected, "unexpected nr_map_ids")) + return false; + + return true; +} + +/* + * Load a program, which uses one map. No fd_array maps are present. + * On return only one map is expected to be bound to prog. + */ +static void check_fd_array_cnt__no_fd_array(void) +{ + __u32 map_ids[16]; + __u32 nr_map_ids; + int prog_fd = -1; + + prog_fd = load_test_prog(NULL, 0); + if (!ASSERT_GE(prog_fd, 0, "BPF_PROG_LOAD")) + return; + nr_map_ids = ARRAY_SIZE(map_ids); + check_expected_map_ids(prog_fd, 1, map_ids, &nr_map_ids); + close(prog_fd); +} + +/* + * Load a program, which uses one map, and pass two extra, non-equal, maps in + * fd_array with fd_array_cnt=2. On return three maps are expected to be bound + * to the program. + */ +static void check_fd_array_cnt__fd_array_ok(void) +{ + int extra_fds[2] = { -1, -1 }; + __u32 map_ids[16]; + __u32 nr_map_ids; + int prog_fd = -1; + + extra_fds[0] = new_map(); + if (!ASSERT_GE(extra_fds[0], 0, "new_map")) + goto cleanup; + extra_fds[1] = new_map(); + if (!ASSERT_GE(extra_fds[1], 0, "new_map")) + goto cleanup; + prog_fd = load_test_prog(extra_fds, 2); + if (!ASSERT_GE(prog_fd, 0, "BPF_PROG_LOAD")) + goto cleanup; + nr_map_ids = ARRAY_SIZE(map_ids); + if (!check_expected_map_ids(prog_fd, 3, map_ids, &nr_map_ids)) + goto cleanup; + + /* maps should still exist when original file descriptors are closed */ + Close(extra_fds[0]); + Close(extra_fds[1]); + if (!ASSERT_EQ(map_exists(map_ids[0]), true, "map_ids[0] should exist")) + goto cleanup; + if (!ASSERT_EQ(map_exists(map_ids[1]), true, "map_ids[1] should exist")) + goto cleanup; + + /* some fds might be invalid, so ignore return codes */ +cleanup: + Close(extra_fds[1]); + Close(extra_fds[0]); + Close(prog_fd); +} + +/* + * Load a program with a few extra maps duplicated in the fd_array. + * After the load maps should only be referenced once. 
+ */ +static void check_fd_array_cnt__duplicated_maps(void) +{ + int extra_fds[4] = { -1, -1, -1, -1 }; + __u32 map_ids[16]; + __u32 nr_map_ids; + int prog_fd = -1; + + extra_fds[0] = extra_fds[2] = new_map(); + if (!ASSERT_GE(extra_fds[0], 0, "new_map")) + goto cleanup; + extra_fds[1] = extra_fds[3] = new_map(); + if (!ASSERT_GE(extra_fds[1], 0, "new_map")) + goto cleanup; + prog_fd = load_test_prog(extra_fds, 4); + if (!ASSERT_GE(prog_fd, 0, "BPF_PROG_LOAD")) + goto cleanup; + nr_map_ids = ARRAY_SIZE(map_ids); + if (!check_expected_map_ids(prog_fd, 3, map_ids, &nr_map_ids)) + goto cleanup; + + /* maps should still exist when original file descriptors are closed */ + Close(extra_fds[0]); + Close(extra_fds[1]); + if (!ASSERT_EQ(map_exists(map_ids[0]), true, "map should exist")) + goto cleanup; + if (!ASSERT_EQ(map_exists(map_ids[1]), true, "map should exist")) + goto cleanup; + + /* some fds might be invalid, so ignore return codes */ +cleanup: + Close(extra_fds[1]); + Close(extra_fds[0]); + Close(prog_fd); +} + +/* + * Check that if maps which are referenced by a program are + * passed in fd_array, then they will be referenced only once + */ +static void check_fd_array_cnt__referenced_maps_in_fd_array(void) +{ + int extra_fds[1] = { -1 }; + __u32 map_ids[16]; + __u32 nr_map_ids; + int prog_fd = -1; + + extra_fds[0] = new_map(); + if (!ASSERT_GE(extra_fds[0], 0, "new_map")) + goto cleanup; + prog_fd = __load_test_prog(extra_fds[0], extra_fds, 1); + if (!ASSERT_GE(prog_fd, 0, "BPF_PROG_LOAD")) + goto cleanup; + nr_map_ids = ARRAY_SIZE(map_ids); + if (!check_expected_map_ids(prog_fd, 1, map_ids, &nr_map_ids)) + goto cleanup; + + /* map should still exist when original file descriptor is closed */ + Close(extra_fds[0]); + if (!ASSERT_EQ(map_exists(map_ids[0]), true, "map should exist")) + goto cleanup; + + /* some fds might be invalid, so ignore return codes */ +cleanup: + Close(extra_fds[0]); + Close(prog_fd); +} + +static int get_btf_id_by_fd(int btf_fd, __u32 *id) +{ + struct bpf_btf_info info; + __u32 info_len = sizeof(info); + int err; + + memset(&info, 0, info_len); + err = bpf_btf_get_info_by_fd(btf_fd, &info, &info_len); + if (err) + return err; + if (id) + *id = info.id; + return 0; +} + +/* + * Check that fd_array operates properly for btfs. Namely, to check that + * passing a btf fd in fd_array increases its reference count, do the + * following: + * 1) Create a new btf, it's referenced only by a file descriptor, so refcnt=1 + * 2) Load a BPF prog with fd_array[0] = btf_fd; now btf's refcnt=2 + * 3) Close the btf_fd, now refcnt=1 + * Wait and check that BTF stil exists. + */ +static void check_fd_array_cnt__referenced_btfs(void) +{ + int extra_fds[1] = { -1 }; + int prog_fd = -1; + __u32 btf_id; + int tries; + int err; + + extra_fds[0] = new_btf(); + if (!ASSERT_GE(extra_fds[0], 0, "new_btf")) + goto cleanup; + prog_fd = load_test_prog(extra_fds, 1); + if (!ASSERT_GE(prog_fd, 0, "BPF_PROG_LOAD")) + goto cleanup; + + /* btf should still exist when original file descriptor is closed */ + err = get_btf_id_by_fd(extra_fds[0], &btf_id); + if (!ASSERT_GE(err, 0, "get_btf_id_by_fd")) + goto cleanup; + + Close(extra_fds[0]); + + if (!ASSERT_GE(kern_sync_rcu(), 0, "kern_sync_rcu 1")) + goto cleanup; + + if (!ASSERT_EQ(btf_exists(btf_id), true, "btf should exist")) + goto cleanup; + + Close(prog_fd); + + /* The program is freed by a workqueue, so no reliable + * way to sync, so just wait a bit (max ~1 second). 
*/ + for (tries = 100; tries >= 0; tries--) { + usleep(1000); + + if (!btf_exists(btf_id)) + break; + + if (tries) + continue; + + PRINT_FAIL("btf should have been freed"); + } + + /* some fds might be invalid, so ignore return codes */ +cleanup: + Close(extra_fds[0]); + Close(prog_fd); +} + +/* + * Test that a program with trash in fd_array can't be loaded: + * only map and BTF file descriptors should be accepted. + */ +static void check_fd_array_cnt__fd_array_with_trash(void) +{ + int extra_fds[3] = { -1, -1, -1 }; + int prog_fd = -1; + + extra_fds[0] = new_map(); + if (!ASSERT_GE(extra_fds[0], 0, "new_map")) + goto cleanup; + extra_fds[1] = new_btf(); + if (!ASSERT_GE(extra_fds[1], 0, "new_btf")) + goto cleanup; + + /* trash 1: not a file descriptor */ + extra_fds[2] = 0xbeef; + prog_fd = load_test_prog(extra_fds, 3); + if (!ASSERT_EQ(prog_fd, -EBADF, "prog should have been rejected with -EBADF")) + goto cleanup; + + /* trash 2: not a map or btf */ + extra_fds[2] = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_GE(extra_fds[2], 0, "socket")) + goto cleanup; + + prog_fd = load_test_prog(extra_fds, 3); + if (!ASSERT_EQ(prog_fd, -EINVAL, "prog should have been rejected with -EINVAL")) + goto cleanup; + + /* Validate that the prog is ok if trash is removed */ + Close(extra_fds[2]); + extra_fds[2] = new_btf(); + if (!ASSERT_GE(extra_fds[2], 0, "new_btf")) + goto cleanup; + + prog_fd = load_test_prog(extra_fds, 3); + if (!ASSERT_GE(prog_fd, 0, "prog should have been loaded")) + goto cleanup; + + /* some fds might be invalid, so ignore return codes */ +cleanup: + Close(extra_fds[2]); + Close(extra_fds[1]); + Close(extra_fds[0]); +} + +/* + * Test that a program with too big fd_array can't be loaded. + */ +static void check_fd_array_cnt__fd_array_too_big(void) +{ + int extra_fds[65]; + int prog_fd = -1; + int i; + + for (i = 0; i < 65; i++) { + extra_fds[i] = new_map(); + if (!ASSERT_GE(extra_fds[i], 0, "new_map")) + goto cleanup_fds; + } + + prog_fd = load_test_prog(extra_fds, 65); + ASSERT_EQ(prog_fd, -E2BIG, "prog should have been rejected with -E2BIG"); + +cleanup_fds: + while (i > 0) + Close(extra_fds[--i]); +} + +void test_fd_array_cnt(void) +{ + if (test__start_subtest("no-fd-array")) + check_fd_array_cnt__no_fd_array(); + + if (test__start_subtest("fd-array-ok")) + check_fd_array_cnt__fd_array_ok(); + + if (test__start_subtest("fd-array-dup-input")) + check_fd_array_cnt__duplicated_maps(); + + if (test__start_subtest("fd-array-ref-maps-in-array")) + check_fd_array_cnt__referenced_maps_in_fd_array(); + + if (test__start_subtest("fd-array-ref-btfs")) + check_fd_array_cnt__referenced_btfs(); + + if (test__start_subtest("fd-array-trash-input")) + check_fd_array_cnt__fd_array_with_trash(); + + if (test__start_subtest("fd-array-2big")) + check_fd_array_cnt__fd_array_too_big(); +} -- cgit v1.2.3 From d677a10f80abf1ef65ae9bcf51b5a83ecf10e99a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:34 +0000 Subject: selftest/bpf: Replace magic constants by macros Replace magic constants in a BTF structure initialization code by proper macros, as is done in other similar selftests. 
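
For context (a sketch from memory, not part of the patch; tools/testing/selftests/bpf/test_btf.h is the authoritative definition), the helper used in the hunk below emits four 32-bit words per integer type, which is why the two BTF_TYPE_INT_ENC() entries exactly fill the __u32 types[8] array that sizeof(raw_btf.types) now measures:

	/*
	 * Approximate expansion of the test_btf.h helper:
	 *
	 *   BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz)
	 *       -> (name),                                 word 0: name_off
	 *          BTF_INFO_ENC(BTF_KIND_INT, 0, 0),       word 1: kind/vlen info
	 *          (sz),                                   word 2: size in bytes
	 *          ((encoding) << 24 |
	 *           (bits_offset) << 16 | (bits))          word 3: int encoding data
	 */
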
Suggested-by: Eduard Zingerman Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-8-aspsk@isovalent.com --- tools/testing/selftests/bpf/progs/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/syscall.c b/tools/testing/selftests/bpf/progs/syscall.c index 0f4dfb770c32..b698cc62a371 100644 --- a/tools/testing/selftests/bpf/progs/syscall.c +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -76,9 +76,9 @@ static int btf_load(void) .magic = BTF_MAGIC, .version = BTF_VERSION, .hdr_len = sizeof(struct btf_header), - .type_len = sizeof(__u32) * 8, - .str_off = sizeof(__u32) * 8, - .str_len = sizeof(__u32), + .type_len = sizeof(raw_btf.types), + .str_off = offsetof(struct btf_blob, str) - offsetof(struct btf_blob, types), + .str_len = sizeof(raw_btf.str), }, .types = { /* long */ -- cgit v1.2.3 From c00d738e1673ab801e1577e4e3c780ccf88b1a5b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:27 -0800 Subject: bpf: Revert "bpf: Mark raw_tp arguments with PTR_MAYBE_NULL" This patch reverts commit cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL"). The patch was well-intended and meant to be as a stop-gap fixing branch prediction when the pointer may actually be NULL at runtime. Eventually, it was supposed to be replaced by an automated script or compiler pass detecting possibly NULL arguments and marking them accordingly. However, it caused two main issues observed for production programs and failed to preserve backwards compatibility. First, programs relied on the verifier not exploring == NULL branch when pointer is not NULL, thus they started failing with a 'dereference of scalar' error. Next, allowing raw_tp arguments to be modified surfaced the warning in the verifier that warns against reg->off when PTR_MAYBE_NULL is set. More information, context, and discusson on both problems is available in [0]. Overall, this approach had several shortcomings, and the fixes would further complicate the verifier's logic, and the entire masking scheme would have to be removed eventually anyway. Hence, revert the patch in preparation of a better fix avoiding these issues to replace this commit. [0]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com Reported-by: Manu Bretelle Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 5aaf2b065f86..bba3e37f749b 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -7,11 +7,7 @@ #include "bpf_misc.h" SEC("tp_btf/bpf_testmod_test_nullable_bare") -/* This used to be a failure test, but raw_tp nullable arguments can now - * directly be dereferenced, whether they have nullable annotation or not, - * and don't need to be explicitly checked. 
- */ -__success +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) { return nullable_ctx->len; -- cgit v1.2.3 From 838a10bd2ebfe11a60dd67687533a7cfc220cc86 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:28 -0800 Subject: bpf: Augment raw_tp arguments with PTR_MAYBE_NULL Arguments to a raw tracepoint are tagged as trusted, which carries the semantics that the pointer will be non-NULL. However, in certain cases, a raw tracepoint argument may end up being NULL. More context about this issue is available in [0]. Thus, there is a discrepancy between the reality, that raw_tp arguments can actually be NULL, and the verifier's knowledge, that they are never NULL, causing explicit NULL check branch to be dead code eliminated. A previous attempt [1], i.e. the second fixed commit, was made to simulate symbolic execution as if in most accesses, the argument is a non-NULL raw_tp, except for conditional jumps. This tried to suppress branch prediction while preserving compatibility, but surfaced issues with production programs that were difficult to solve without increasing verifier complexity. A more complete discussion of issues and fixes is available at [2]. Fix this by maintaining an explicit list of tracepoints where the arguments are known to be NULL, and mark the positional arguments as PTR_MAYBE_NULL. Additionally, capture the tracepoints where arguments are known to be ERR_PTR, and mark these arguments as scalar values to prevent potential dereference. Each hex digit is used to encode NULL-ness (0x1) or ERR_PTR-ness (0x2), shifted by the zero-indexed argument number x 4. This can be represented as follows: 1st arg: 0x1 2nd arg: 0x10 3rd arg: 0x100 ... and so on (likewise for ERR_PTR case). In the future, an automated pass will be used to produce such a list, or insert __nullable annotations automatically for tracepoints. Each compilation unit will be analyzed and results will be collated to find whether a tracepoint pointer is definitely not null, maybe null, or an unknown state where verifier conservatively marks it PTR_MAYBE_NULL. A proof of concept of this tool from Eduard is available at [3]. Note that in case we don't find a specification in the raw_tp_null_args array and the tracepoint belongs to a kernel module, we will conservatively mark the arguments as PTR_MAYBE_NULL. This is because unlike for in-tree modules, out-of-tree module tracepoints may pass NULL freely to the tracepoint. We don't protect against such tracepoints passing ERR_PTR (which is uncommon anyway), lest we mark all such arguments as SCALAR_VALUE. While we are it, let's adjust the test raw_tp_null to not perform dereference of the skb->mark, as that won't be allowed anymore, and make it more robust by using inline assembly to test the dead code elimination behavior, which should still stay the same. 
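As a rough illustration of the per-argument encoding (the helper macros below are illustrative only, not the kernel's actual definitions):

	/* bit 0 of an argument's hex digit: may be NULL; bit 1: may be ERR_PTR */
	#define ARG_MAYBE_NULL(n)	(0x1ULL << ((n) * 4))
	#define ARG_MAYBE_ERR_PTR(n)	(0x2ULL << ((n) * 4))

	/*
	 * A tracepoint whose 2nd argument may be NULL and whose 3rd argument
	 * may be an ERR_PTR would be listed as
	 * ARG_MAYBE_NULL(1) | ARG_MAYBE_ERR_PTR(2), i.e. 0x210.
	 */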
[0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb [1]: https://lore.kernel.org/all/20241104171959.2938862-1-memxor@gmail.com [2]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com [3]: https://github.com/eddyz87/llvm-project/tree/nullness-for-tracepoint-params Reported-by: Juri Lelli # original bug Reported-by: Manu Bretelle # bugs in masking fix Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Reviewed-by: Eduard Zingerman Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/raw_tp_null.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c index 457f34c151e3..5927054b6dd9 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c @@ -3,6 +3,7 @@ #include #include +#include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -17,16 +18,14 @@ int BPF_PROG(test_raw_tp_null, struct sk_buff *skb) if (task->pid != tid) return 0; - i = i + skb->mark + 1; - /* The compiler may move the NULL check before this deref, which causes - * the load to fail as deref of scalar. Prevent that by using a barrier. + /* If dead code elimination kicks in, the increment +=2 will be + * removed. For raw_tp programs attaching to tracepoints in kernel + * modules, we mark input arguments as PTR_MAYBE_NULL, so branch + * prediction should never kick in. */ - barrier(); - /* If dead code elimination kicks in, the increment below will - * be removed. For raw_tp programs, we mark input arguments as - * PTR_MAYBE_NULL, so branch prediction should never kick in. - */ - if (!skb) - i += 2; + asm volatile ("%[i] += 1; if %[ctx] != 0 goto +1; %[i] += 2;" + : [i]"+r"(i) + : [ctx]"r"(skb) + : "memory"); return 0; } -- cgit v1.2.3 From 0da1955b5bd2af3a1c3d13916df06e34ffa6df3d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:29 -0800 Subject: selftests/bpf: Add tests for raw_tp NULL args Add tests to ensure that arguments are correctly marked based on their specified positions, and whether they get marked correctly as maybe null. For modules, all tracepoint parameters should be marked PTR_MAYBE_NULL by default. 
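The new tests pick arguments by their offset in the context: tp_btf programs see the tracepoint arguments as an array of u64 values, so a load from r1+0 selects the first argument and r1+8 the second. A hedged C sketch of the same idea (not one of the added tests verbatim; the explicit NULL check is exactly what the raw-asm variants leave out so that verification must fail):

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>

	char _license[] SEC("license") = "GPL";

	SEC("tp_btf/sched_pi_setprio")
	int second_arg_needs_check(__u64 *ctx)
	{
		/* ctx[1] is *(u64 *)(r1 + 8): the tracepoint's second argument */
		struct task_struct *pi_task = (struct task_struct *)ctx[1];

		if (!pi_task)	/* required once the argument is PTR_MAYBE_NULL */
			return 0;
		return pi_task->prio;
	}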
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/raw_tp_null.c | 3 +++ .../testing/selftests/bpf/progs/raw_tp_null_fail.c | 24 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/raw_tp_null_fail.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c index 6fa19449297e..43676a9922dc 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c @@ -3,11 +3,14 @@ #include #include "raw_tp_null.skel.h" +#include "raw_tp_null_fail.skel.h" void test_raw_tp_null(void) { struct raw_tp_null *skel; + RUN_TESTS(raw_tp_null_fail); + skel = raw_tp_null__open_and_load(); if (!ASSERT_OK_PTR(skel, "raw_tp_null__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c new file mode 100644 index 000000000000..38d669957bf1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Ensure module parameter has PTR_MAYBE_NULL */ +SEC("tp_btf/bpf_testmod_test_raw_tp_null") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_bpf_testmod_test_raw_tp_null_arg_1(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +0); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} + +/* Check NULL marking */ +SEC("tp_btf/sched_pi_setprio") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_sched_pi_setprio_arg_2(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +8); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} -- cgit v1.2.3 From 6ca774f06a7df650f41b38b67bec0665d862ac23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:56:28 -0700 Subject: torture: Make kvm-remote.sh give up on unresponsive system Currently, a system that stops responding at the wrong time will hang kvm-remote.sh. This can happen when the system in question is forced offline for maintenance, and there is currently no way for the user to kick this script into moving ahead. This commit therefore causes kvm-remote.sh to wait at most 15 minutes for a non-responsive system, that is, a system for which ssh gives an exit code of 255. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- .../testing/selftests/rcutorture/bin/kvm-remote.sh | 25 ++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 134cdef5a6e0..48a8052d5dae 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -181,10 +181,11 @@ done # Function to check for presence of a file on the specified system. # Complain if the system cannot be reached, and retry after a wait. -# Currently just waits forever if a machine disappears. +# Currently just waits 15 minutes if a machine disappears. 
# # Usage: checkremotefile system pathname checkremotefile () { + local nsshfails=0 local ret local sleeptime=60 @@ -195,6 +196,11 @@ checkremotefile () { if test "$ret" -eq 255 then echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log" + nsshfails=$((nsshfails+1)) + if ((nsshfails > 15)) + then + return 255 + fi elif test "$ret" -eq 0 then return 0 @@ -268,12 +274,23 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log" for i in $systems do echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log" - while checkremotefile "$i" "$resdir/$ds/remote.run" + while : do + checkremotefile "$i" "$resdir/$ds/remote.run" + ret=$? + if test "$ret" -eq 1 + then + echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log" + ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) + break; + fi + if test "$ret" -eq 255 + then + echo System $i persistent ssh failure, lost results `date` | tee -a "$oldrun/remote-log" + break; + fi sleep 30 done - echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log" - ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) done ( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log" -- cgit v1.2.3 From 5ec090011bd2bb6ea6c2c607371db57ee0506a89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Oct 2024 11:49:54 -0700 Subject: rcutorture: Make the TREE03 scenario do preemption This commit adds the rcutorture.preempt_duration module parameter to rcutorture's TREE03.boot parameter list in order to better test preemption of RCU read-side critical sections. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 8e50bfd4b710..90318591dae2 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -5,3 +5,4 @@ rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs rcutree.use_softirq=0 +rcutorture.preempt_duration=10 -- cgit v1.2.3 From 663ad7481f068057f6f692c5368c47150e855370 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 13:07:11 +0000 Subject: tools/net/ynl: fix sub-message key lookup for nested attributes Use the correct attribute space for sub-message key lookup in nested attributes when adding attributes. This fixes rt_link where the "kind" key and "data" sub-message are nested attributes in "linkinfo". 
For example: ./tools/net/ynl/cli.py \ --create \ --spec Documentation/netlink/specs/rt_link.yaml \ --do newlink \ --json '{"link": 99, "linkinfo": { "kind": "vlan", "data": {"id": 4 } } }' Signed-off-by: Donald Hunter Fixes: ab463c4342d1 ("tools/net/ynl: Add support for encoding sub-messages") Link: https://patch.msgid.link/20241213130711.40267-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 01ec01a90e76..eea29359a899 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -556,10 +556,10 @@ class YnlFamily(SpecFamily): if attr["type"] == 'nest': nl_type |= Netlink.NLA_F_NESTED attr_payload = b'' - sub_attrs = SpaceAttrs(self.attr_sets[space], value, search_attrs) + sub_space = attr['nested-attributes'] + sub_attrs = SpaceAttrs(self.attr_sets[sub_space], value, search_attrs) for subname, subvalue in value.items(): - attr_payload += self._add_attr(attr['nested-attributes'], - subname, subvalue, sub_attrs) + attr_payload += self._add_attr(sub_space, subname, subvalue, sub_attrs) elif attr["type"] == 'flag': if not value: # If value is absent or false then skip attribute creation. -- cgit v1.2.3 From b2e584aa3c710802600b690f34a56fb526aebf2f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:08 +0100 Subject: selftests: tls: add key_generation argument to tls_crypto_info_init This allows us to generate different keys, so that we can test that rekey is using the correct one. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 1a706d03bb6b..b1f52d2bb096 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -44,9 +44,11 @@ struct tls_crypto_info_keys { }; static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type, - struct tls_crypto_info_keys *tls12) + struct tls_crypto_info_keys *tls12, + char key_generation) { - memset(tls12, 0, sizeof(*tls12)); + memset(tls12, key_generation, sizeof(*tls12)); + memset(tls12, 0, sizeof(struct tls_crypto_info)); switch (cipher_type) { case TLS_CIPHER_CHACHA20_POLY1305: @@ -275,7 +277,7 @@ TEST_F(tls_basic, recseq_wrap) if (self->notls) SKIP(return, "no TLS support"); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); memset(&tls12.aes128.rec_seq, 0xff, sizeof(tls12.aes128.rec_seq)); ASSERT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); @@ -391,7 +393,7 @@ FIXTURE_SETUP(tls) SKIP(return, "Unsupported cipher in FIPS mode"); tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); @@ -1175,7 +1177,7 @@ TEST_F(tls, bidir) struct tls_crypto_info_keys tls12; tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, tls12.len); @@ -1614,7 +1616,7 @@ TEST_F(tls, getsockopt) EXPECT_EQ(get.crypto_info.cipher_type, variant->cipher_type); /* get the full crypto_info */ - tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect); + 
tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect, 0); len = expect.len; memrnd(&get, sizeof(get)); EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &get, &len), 0); @@ -1696,7 +1698,7 @@ FIXTURE_SETUP(tls_err) int ret; tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_128, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); ulp_sock_pair(_metadata, &self->fd2, &self->cfd2, &self->notls); @@ -2118,7 +2120,7 @@ TEST(tls_v6ops) { int sfd, ret, fd; socklen_t len, len2; - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); addr.sin6_family = AF_INET6; addr.sin6_addr = in6addr_any; @@ -2177,7 +2179,7 @@ TEST(prequeue) { len = sizeof(addr); memrnd(buf, sizeof(buf)); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12, 0); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); -- cgit v1.2.3 From 555f0edb9ff043196655a5b7cc65f67dfd05b530 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:09 +0100 Subject: selftests: tls: add rekey tests Test the kernel's ability to: - update the key (but not the version or cipher), only for TLS1.3 - pause decryption after receiving a KeyUpdate message, until a new RX key has been provided - reflect the pause/non-readable socket in poll() Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 458 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index b1f52d2bb096..9a85f93c33d8 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1670,6 +1670,464 @@ TEST_F(tls, recv_efault) EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0); } +#define TLS_RECORD_TYPE_HANDSHAKE 0x16 +/* key_update, length 1, update_not_requested */ +static const char key_update_msg[] = "\x18\x00\x00\x01\x00"; +static void tls_send_keyupdate(struct __test_metadata *_metadata, int fd) +{ + size_t len = sizeof(key_update_msg); + + EXPECT_EQ(tls_send_cmsg(fd, TLS_RECORD_TYPE_HANDSHAKE, + (char *)key_update_msg, len, 0), + len); +} + +static void tls_recv_keyupdate(struct __test_metadata *_metadata, int fd, int flags) +{ + char buf[100]; + + EXPECT_EQ(tls_recv_cmsg(_metadata, fd, TLS_RECORD_TYPE_HANDSHAKE, buf, sizeof(buf), flags), + sizeof(key_update_msg)); + EXPECT_EQ(memcmp(buf, key_update_msg, sizeof(key_update_msg)), 0); +} + +/* set the key to 0 then 1 for RX, immediately to 1 for TX */ +TEST_F(tls_basic, rekey_rx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 
0); +} + +/* set the key to 0 then 1 for TX, immediately to 1 for RX */ +TEST_F(tls_basic, rekey_tx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 0); +} + +TEST_F(tls, rekey) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* send after rekey */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_fail) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + + if (variant->tls_version != TLS_1_3_VERSION) { + /* just check that rekey is not supported and return */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EBUSY); + return; + } + + /* successful update */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* invalid update: change of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, 
&tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update (RX socket): change of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update: change of cipher */ + if (variant->cipher_type == TLS_CIPHER_AES_GCM_256) + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_CHACHA20_POLY1305, &tls12, 1); + else + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_256, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* send after rekey, the invalid updates shouldn't have an effect */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_peek) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_PEEK), -1); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); +} + +TEST_F(tls, splice_rekey) +{ + int send_len = TLS_PAYLOAD_MAX_LEN / 2; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + struct tls_crypto_info_keys tls12; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 
0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + /* can't splice the KeyUpdate */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EINVAL); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* can't splice before updating the key */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); +} + +TEST_F(tls, rekey_peek_splice) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + ASSERT_GE(pipe(p), 0); + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_recv, test_str_1, send_len), 0); +} + +TEST_F(tls, rekey_getsockopt) +{ + struct tls_crypto_info_keys tls12; + struct tls_crypto_info_keys tls12_get; + socklen_t len; + + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + tls_recv_keyupdate(_metadata, self->cfd, 0); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); +} + +TEST_F(tls, rekey_poll_pending) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, 
TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* send immediately after rekey */ + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + /* key hasn't been updated, expect cfd to be non-readable */ + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 0), 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + + exit(!__test_passed(_metadata)); + } +} + +TEST_F(tls, rekey_poll_delay) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + sleep(1); + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + exit(!__test_passed(_metadata)); + } +} + FIXTURE(tls_err) { int fd, cfd; -- cgit v1.2.3 From 9d6c0e58514f8b57cd9c2c755e41623d6a966025 Mon Sep 17 00:00:00 2001 From: He Rongguang Date: Thu, 12 Dec 2024 10:14:59 +0800 Subject: cpupower: fix TSC MHz calculation Commit 'cpupower: Make TSC read per CPU for Mperf monitor' (c2adb1877b7) changes TSC counter reads per cpu, but left time diff global (from start of all cpus to end of all cpus), thus diff(time) is too large for a cpu's tsc counting, resulting in far less than acutal TSC_Mhz and thus `cpupower monitor` showing far less than actual cpu realtime frequency. /proc/cpuinfo shows frequency: cat /proc/cpuinfo | egrep -e 'processor' -e 'MHz' ... processor : 171 cpu MHz : 4108.498 ... 
before fix (System 100% busy): | Mperf || Idle_Stats CPU| C0 | Cx | Freq || POLL | C1 | C2 171| 0.77| 99.23| 2279|| 0.00| 0.00| 0.00 after fix (System 100% busy): | Mperf || Idle_Stats CPU| C0 | Cx | Freq || POLL | C1 | C2 171| 0.46| 99.54| 4095|| 0.00| 0.00| 0.00 Fixes: c2adb1877b76 ("cpupower: Make TSC read per CPU for Mperf monitor") Signed-off-by: He Rongguang Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/mperf_monitor.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index 0a03573ebcc2..73b6b10cbdd2 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -33,7 +33,7 @@ static int mperf_get_count_percent(unsigned int self_id, double *percent, unsigned int cpu); static int mperf_get_count_freq(unsigned int id, unsigned long long *count, unsigned int cpu); -static struct timespec time_start, time_end; +static struct timespec *time_start, *time_end; static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { { @@ -174,7 +174,7 @@ static int mperf_get_count_percent(unsigned int id, double *percent, dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", mperf_cstates[id].name, mperf_diff, tsc_diff); } else if (max_freq_mode == MAX_FREQ_SYSFS) { - timediff = max_frequency * timespec_diff_us(time_start, time_end); + timediff = max_frequency * timespec_diff_us(time_start[cpu], time_end[cpu]); *percent = 100.0 * mperf_diff / timediff; dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", mperf_cstates[id].name, mperf_diff, timediff); @@ -207,7 +207,7 @@ static int mperf_get_count_freq(unsigned int id, unsigned long long *count, if (max_freq_mode == MAX_FREQ_TSC_REF) { /* Calculate max_freq from TSC count */ tsc_diff = tsc_at_measure_end[cpu] - tsc_at_measure_start[cpu]; - time_diff = timespec_diff_us(time_start, time_end); + time_diff = timespec_diff_us(time_start[cpu], time_end[cpu]); max_frequency = tsc_diff / time_diff; } @@ -226,9 +226,8 @@ static int mperf_start(void) { int cpu; - clock_gettime(CLOCK_REALTIME, &time_start); - for (cpu = 0; cpu < cpu_count; cpu++) { + clock_gettime(CLOCK_REALTIME, &time_start[cpu]); mperf_get_tsc(&tsc_at_measure_start[cpu]); mperf_init_stats(cpu); } @@ -243,9 +242,9 @@ static int mperf_stop(void) for (cpu = 0; cpu < cpu_count; cpu++) { mperf_measure_stats(cpu); mperf_get_tsc(&tsc_at_measure_end[cpu]); + clock_gettime(CLOCK_REALTIME, &time_end[cpu]); } - clock_gettime(CLOCK_REALTIME, &time_end); return 0; } @@ -349,6 +348,8 @@ struct cpuidle_monitor *mperf_register(void) aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); tsc_at_measure_start = calloc(cpu_count, sizeof(unsigned long long)); tsc_at_measure_end = calloc(cpu_count, sizeof(unsigned long long)); + time_start = calloc(cpu_count, sizeof(struct timespec)); + time_end = calloc(cpu_count, sizeof(struct timespec)); mperf_monitor.name_len = strlen(mperf_monitor.name); return &mperf_monitor; } @@ -361,6 +362,8 @@ void mperf_unregister(void) free(aperf_current_count); free(tsc_at_measure_start); free(tsc_at_measure_end); + free(time_start); + free(time_end); free(is_valid); } -- cgit v1.2.3 From 8dccbecbb9692a96cf477eb826352a7c556a31e2 Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Fri, 13 Dec 2024 16:06:20 +0100 Subject: selftests/bpf: test_xdp_meta: Rename BPF sections SEC("t") and SEC("x") can't be loaded by 
the __load() helper. Rename these sections SEC("tc") and SEC("xdp") so they can be interpreted by the __load() helper in upcoming patch. Update the test_xdp_meta.sh to fit these new names. Signed-off-by: Bastien Curutchet Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20241213-xdp_meta-v2-1-634582725b90@bootlin.com --- tools/testing/selftests/bpf/progs/test_xdp_meta.c | 4 ++-- tools/testing/selftests/bpf/test_xdp_meta.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index a7c4a7d49fe6..fe2d71ae0e71 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -8,7 +8,7 @@ #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem -SEC("t") +SEC("tc") int ing_cls(struct __sk_buff *ctx) { __u8 *data, *data_meta, *data_end; @@ -28,7 +28,7 @@ int ing_cls(struct __sk_buff *ctx) return diff ? TC_ACT_SHOT : TC_ACT_OK; } -SEC("x") +SEC("xdp") int ing_xdp(struct xdp_md *ctx) { __u8 *data, *data_meta, *data_end; diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh index 2740322c1878..6039b92f1094 100755 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ b/tools/testing/selftests/bpf/test_xdp_meta.sh @@ -43,11 +43,11 @@ ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2 ip netns exec ${NS1} tc qdisc add dev veth1 clsact ip netns exec ${NS2} tc qdisc add dev veth2 clsact -ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec t -ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec t +ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec tc +ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec tc -ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec x -ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec x +ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec xdp +ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec xdp ip netns exec ${NS1} ip link set dev veth1 up ip netns exec ${NS2} ip link set dev veth2 up -- cgit v1.2.3 From df539cefb0abbd16be9fbcc6ec46a5a35495800f Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Fri, 13 Dec 2024 16:06:21 +0100 Subject: selftests/bpf: Migrate test_xdp_meta.sh into xdp_context_test_run.c test_xdp_meta.sh can't be used by the BPF CI. Migrate test_xdp_meta.sh in a new test case in xdp_context_test_run.c. It uses the same BPF programs located in progs/test_xdp_meta.c and the same network topology. Remove test_xdp_meta.sh and its Makefile entry. 
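For context, the migrated test exercises the XDP to TC metadata handoff over a veth pair. A rough sketch of that pattern (not the selftest's exact programs): the XDP side reserves headroom with bpf_xdp_adjust_meta() and stores a value there, and a TC classifier on the same device reads it back between data_meta and data.

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("xdp")
	int xdp_store_meta(struct xdp_md *ctx)
	{
		__u32 *meta;

		if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
			return XDP_PASS;

		meta = (void *)(long)ctx->data_meta;
		if ((void *)(meta + 1) > (void *)(long)ctx->data)
			return XDP_PASS;

		*meta = 0xcafe;	/* a tc program can read this via __sk_buff data_meta */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";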
Signed-off-by: Bastien Curutchet Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20241213-xdp_meta-v2-2-634582725b90@bootlin.com --- tools/testing/selftests/bpf/Makefile | 1 - .../bpf/prog_tests/xdp_context_test_run.c | 87 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_meta.sh | 58 --------------- 3 files changed, 87 insertions(+), 59 deletions(-) delete mode 100755 tools/testing/selftests/bpf/test_xdp_meta.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ad3b1ba1920..772bfc6b63fa 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -129,7 +129,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ - test_xdp_meta.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index e6a783c7f5db..937da9b7532a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -2,6 +2,14 @@ #include #include #include "test_xdp_context_test_run.skel.h" +#include "test_xdp_meta.skel.h" + +#define TX_ADDR "10.0.0.1" +#define RX_ADDR "10.0.0.2" +#define RX_NAME "veth0" +#define TX_NAME "veth1" +#define TX_NETNS "xdp_context_tx" +#define RX_NETNS "xdp_context_rx" void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, __u32 data_meta, __u32 data, __u32 data_end, @@ -103,3 +111,82 @@ void test_xdp_context_test_run(void) test_xdp_context_test_run__destroy(skel); } + +void test_xdp_context_functional(void) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *rx_ns = NULL, *tx_ns = NULL; + struct bpf_program *tc_prog, *xdp_prog; + struct test_xdp_meta *skel = NULL; + struct nstoken *nstoken = NULL; + int rx_ifindex; + int ret; + + tx_ns = netns_new(TX_NETNS, false); + if (!ASSERT_OK_PTR(tx_ns, "create tx_ns")) + return; + + rx_ns = netns_new(RX_NETNS, false); + if (!ASSERT_OK_PTR(rx_ns, "create rx_ns")) + goto close; + + SYS(close, "ip link add " RX_NAME " netns " RX_NETNS + " type veth peer name " TX_NAME " netns " TX_NETNS); + + nstoken = open_netns(RX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns rx_ns")) + goto close; + + SYS(close, "ip addr add " RX_ADDR "/24 dev " RX_NAME); + SYS(close, "ip link set dev " RX_NAME " up"); + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto close; + + rx_ifindex = if_nametoindex(RX_NAME); + if (!ASSERT_GE(rx_ifindex, 0, "if_nametoindex rx")) + goto close; + + tc_hook.ifindex = rx_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_prog = bpf_object__find_program_by_name(skel->obj, "ing_cls"); + if (!ASSERT_OK_PTR(tc_prog, "open ing_cls prog")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(tc_prog); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + xdp_prog = bpf_object__find_program_by_name(skel->obj, "ing_xdp"); + if (!ASSERT_OK_PTR(xdp_prog, "open ing_xdp prog")) + goto close; + + ret = bpf_xdp_attach(rx_ifindex, + bpf_program__fd(xdp_prog), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + 
close_netns(nstoken); + + nstoken = open_netns(TX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns tx_ns")) + goto close; + + SYS(close, "ip addr add " TX_ADDR "/24 dev " TX_NAME); + SYS(close, "ip link set dev " TX_NAME " up"); + ASSERT_OK(SYS_NOFAIL("ping -c 1 " RX_ADDR), "ping"); + +close: + close_netns(nstoken); + test_xdp_meta__destroy(skel); + netns_free(rx_ns); + netns_free(tx_ns); +} + diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh deleted file mode 100755 index 6039b92f1094..000000000000 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -BPF_FILE="test_xdp_meta.bpf.o" -# Kselftest framework requirement - SKIP code is 4. -readonly KSFT_SKIP=4 -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" - -cleanup() -{ - if [ "$?" = "0" ]; then - echo "selftests: test_xdp_meta [PASS]"; - else - echo "selftests: test_xdp_meta [FAILED]"; - fi - - set +e - ip link del veth1 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null -} - -ip link set dev lo xdp off 2>/dev/null > /dev/null -if [ $? -ne 0 ];then - echo "selftests: [SKIP] Could not run test without the ip xdp support" - exit $KSFT_SKIP -fi -set -e - -ip netns add ${NS1} -ip netns add ${NS2} - -trap cleanup 0 2 3 6 9 - -ip link add veth1 type veth peer name veth2 - -ip link set veth1 netns ${NS1} -ip link set veth2 netns ${NS2} - -ip netns exec ${NS1} ip addr add 10.1.1.11/24 dev veth1 -ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2 - -ip netns exec ${NS1} tc qdisc add dev veth1 clsact -ip netns exec ${NS2} tc qdisc add dev veth2 clsact - -ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec tc -ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec tc - -ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec xdp -ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec xdp - -ip netns exec ${NS1} ip link set dev veth1 up -ip netns exec ${NS2} ip link set dev veth2 up - -ip netns exec ${NS1} ping -c 1 10.1.1.22 -ip netns exec ${NS2} ping -c 1 10.1.1.11 - -exit 0 -- cgit v1.2.3 From 184a9358e506b77ade22c07dda4f34d133bc31c0 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 30 Oct 2024 14:37:32 -0600 Subject: selftests/exec: add a test for execveat()'s comm In the previous patch we've updated AT_EMPTY_PATH execs to use the dentry filename. Test for this and just to be sure keeps working with symlinks, which was a concern in [1], I've added a test for that as well. The test itself is a bit ugly, because the existing check_execveat_fail() helpers use a hardcoded envp and argv, and we want to "pass" things via the environment to test various argument values, but it seemed cleaner than passing one in everywhere in all the existing tests. Output looks like: ok 51 Check success of execveat(6, 'home/tycho/packages/...yyyyyyyyyyyyyyyyyyyy', 0)... # Check execveat(AT_EMPTY_PATH)'s comm is execveat ok 52 Check success of execveat(9, '', 4096)... # Check execveat(AT_EMPTY_PATH)'s comm is execveat ok 53 Check success of execveat(11, '', 4096)... # Check execveat(AT_EMPTY_PATH)'s comm is execveat [ 25.579272] process 'execveat' launched '/dev/fd/9' with NULL argv: empty string added ok 54 Check success of execveat(9, '', 4096)... 
Link: https://lore.kernel.org/all/20240925.152228-private.conflict.frozen.trios-TdUGhuI5Sb4v@cyphar.com/ [1] Signed-off-by: Tycho Andersen Link: https://lore.kernel.org/r/20241030203732.248767-2-tycho@tycho.pizza Signed-off-by: Kees Cook --- tools/testing/selftests/exec/execveat.c | 75 +++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 071e03532cba..8fb7395fd35b 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -23,9 +23,11 @@ #include "../kselftest.h" -#define TESTS_EXPECTED 51 +#define TESTS_EXPECTED 54 #define TEST_NAME_LEN (PATH_MAX * 4) +#define CHECK_COMM "CHECK_COMM" + static char longpath[2 * PATH_MAX] = ""; static char *envp[] = { "IN_TEST=yes", NULL, NULL }; static char *argv[] = { "execveat", "99", NULL }; @@ -237,6 +239,29 @@ static int check_execveat_pathmax(int root_dfd, const char *src, int is_script) return fail; } +static int check_execveat_comm(int fd, char *argv0, char *expected) +{ + char buf[128], *old_env, *old_argv0; + int ret; + + snprintf(buf, sizeof(buf), CHECK_COMM "=%s", expected); + + old_env = envp[1]; + envp[1] = buf; + + old_argv0 = argv[0]; + argv[0] = argv0; + + ksft_print_msg("Check execveat(AT_EMPTY_PATH)'s comm is %s\n", + expected); + ret = check_execveat_invoked_rc(fd, "", AT_EMPTY_PATH, 0, 0); + + envp[1] = old_env; + argv[0] = old_argv0; + + return ret; +} + static int run_tests(void) { int fail = 0; @@ -389,6 +414,14 @@ static int run_tests(void) fail += check_execveat_pathmax(root_dfd, "execveat", 0); fail += check_execveat_pathmax(root_dfd, "script", 1); + + /* /proc/pid/comm gives filename by default */ + fail += check_execveat_comm(fd, "sentinel", "execveat"); + /* /proc/pid/comm gives argv[0] when invoked via link */ + fail += check_execveat_comm(fd_symlink, "sentinel", "execveat"); + /* /proc/pid/comm gives filename if NULL is passed */ + fail += check_execveat_comm(fd, NULL, "execveat"); + return fail; } @@ -415,9 +448,13 @@ int main(int argc, char **argv) int ii; int rc; const char *verbose = getenv("VERBOSE"); + const char *check_comm = getenv(CHECK_COMM); - if (argc >= 2) { - /* If we are invoked with an argument, don't run tests. */ + if (argc >= 2 || check_comm) { + /* + * If we are invoked with an argument, or no arguments but a + * command to check, don't run tests. + */ const char *in_test = getenv("IN_TEST"); if (verbose) { @@ -426,6 +463,38 @@ int main(int argc, char **argv) ksft_print_msg("\t[%d]='%s\n'", ii, argv[ii]); } + /* If the tests wanted us to check the command, do so. */ + if (check_comm) { + /* TASK_COMM_LEN == 16 */ + char buf[32]; + int fd, ret; + + fd = open("/proc/self/comm", O_RDONLY); + if (fd < 0) { + ksft_perror("open() comm failed"); + exit(1); + } + + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + ksft_perror("read() comm failed"); + close(fd); + exit(1); + } + close(fd); + + // trim off the \n + buf[ret-1] = 0; + + if (strcmp(buf, check_comm)) { + ksft_print_msg("bad comm, got: %s expected: %s\n", + buf, check_comm); + exit(1); + } + + exit(0); + } + /* Check expected environment transferred. 
*/ if (!in_test || strcmp(in_test, "yes") != 0) { ksft_print_msg("no IN_TEST=yes in env\n"); -- cgit v1.2.3 From 0518863407b8dcc7070fdbc1c015046d66777e78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:42 -0800 Subject: selftests: net: support setting recv_size in YNL recv_size parameter allows constraining the buffer size for dumps. It's useful in testing kernel handling of dump continuation, IOW testing dumps which span multiple skbs. Let the tests set this parameter when initializing the YNL family. Keep the normal default, we don't want tests to unintentionally behave very differently than normal code. Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ynl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index a0d689d58c57..076a7e8dc3eb 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -32,23 +32,23 @@ except ModuleNotFoundError as e: # Set schema='' to avoid jsonschema validation, it's slow # class EthtoolFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class RtnlFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetshaperFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) -- cgit v1.2.3 From 1234810b1649e9d781aeafd4b23fb1fcfbf95d8f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:43 -0800 Subject: selftests: net-drv: queues: sanity check netlink dumps This test already catches a netlink bug fixed by this series, but only when running on HW with many queues. Make sure the netdevsim instance created has a lot of queues, and constrain the size of the recv_buffer used by netlink. While at it test both rx and tx queues. 
Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 30f29096e27c..9c5473abbd78 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -8,25 +8,28 @@ from lib.py import cmd import glob -def sys_get_queues(ifname) -> int: - folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') +def sys_get_queues(ifname, qtype='rx') -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/{qtype}-*') return len(folders) -def nl_get_queues(cfg, nl): +def nl_get_queues(cfg, nl, qtype='rx'): queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) if queues: - return len([q for q in queues if q['type'] == 'rx']) + return len([q for q in queues if q['type'] == qtype]) return None def get_queues(cfg, nl) -> None: - queues = nl_get_queues(cfg, nl) - if not queues: - raise KsftSkipEx('queue-get not supported by device') + snl = NetdevFamily(recv_size=4096) - expected = sys_get_queues(cfg.dev['ifname']) - ksft_eq(queues, expected) + for qtype in ['rx', 'tx']: + queues = nl_get_queues(cfg, snl, qtype) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname'], qtype) + ksft_eq(queues, expected) def addremove_queues(cfg, nl) -> None: @@ -57,7 +60,7 @@ def addremove_queues(cfg, nl) -> None: def main() -> None: - with NetDrvEnv(__file__, queue_count=3) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) ksft_exit() -- cgit v1.2.3 From 5712e323d4c3ad03bba4d28f83e80593171ac3f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:44 -0800 Subject: selftests: net-drv: stats: sanity check netlink dumps Sanity check netlink dumps, to make sure dumps don't have repeated entries or gaps in IDs. 
Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 63e3c045a3b2..031ac9def6c0 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -110,6 +110,23 @@ def qstat_by_ifindex(cfg) -> None: ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + # Sanity check the dumps + queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True) + # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}} + parsed = {} + for entry in queues: + ifindex = entry["ifindex"] + if ifindex not in parsed: + parsed[ifindex] = {"rx":[], "tx": []} + parsed[ifindex][entry["queue-type"]].append(entry['queue-id']) + # Now, validate + for ifindex, queues in parsed.items(): + for qtype in ['rx', 'tx']: + ksft_eq(len(queues[qtype]), len(set(queues[qtype])), + comment="repeated queue keys") + ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, + comment="missing queue keys") + # Test invalid dumps # 0 is invalid with ksft_raises(NlError) as cm: @@ -158,7 +175,7 @@ def check_down(cfg) -> None: def main() -> None: - with NetDrvEnv(__file__) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, check_down], args=(cfg, )) -- cgit v1.2.3 From cda7d5abe089cc8bd6d623cd6577627d8125d155 Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:56 +0100 Subject: selftests: net: test SO_PRIORITY ancillary data with cmsg_sender Extend cmsg_sender.c with a new option '-Q' to send SO_PRIORITY ancillary data. cmsg_so_priority.sh script added to validate SO_PRIORITY behavior by creating VLAN device with egress QoS mapping and testing packet priorities using flower filters. Verify that packets with different priorities are correctly matched and counted by filters for multiple protocols and IP versions. 
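Under the hood the -Q option attaches a SOL_SOCKET/SO_PRIORITY control message to the outgoing sendmsg() call. A minimal sketch of that pattern, mirroring what cmsg_sender does rather than copying it, with error handling and the payload iovec setup omitted:

	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>

	static ssize_t send_with_prio(int fd, struct iovec *iov, unsigned int prio)
	{
		char cbuf[CMSG_SPACE(sizeof(prio))] = { 0 };
		struct msghdr msg = {
			.msg_iov = iov,
			.msg_iovlen = 1,
			.msg_control = cbuf,
			.msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SO_PRIORITY;
		cmsg->cmsg_len = CMSG_LEN(sizeof(prio));
		memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

		return sendmsg(fd, &msg, 0);
	}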
Reviewed-by: Willem de Bruijn Acked-by: Willem de Bruijn Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Suggested-by: Ido Schimmel Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-4-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/cmsg_sender.c | 11 +- tools/testing/selftests/net/cmsg_so_priority.sh | 151 ++++++++++++++++++++++++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/cmsg_so_priority.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index cb2fc601de66..f09bd96cc978 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -32,6 +32,7 @@ TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh +TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh TEST_PROGS += nl_netdev.py diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 876c2db02a63..bc314382e4e1 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -59,6 +59,7 @@ struct options { unsigned int proto; } sock; struct option_cmsg_u32 mark; + struct option_cmsg_u32 priority; struct { bool ena; unsigned int delay; @@ -97,6 +98,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" + "\t\t-P val Set SO_PRIORITY via setsockopt\n" + "\t\t-Q val Set SO_PRIORITY via cmsg\n" "\t\t-d val Set SO_TXTIME with given delay (usec)\n" "\t\t-t Enable time stamp reporting\n" "\t\t-f val Set don't fragment via cmsg\n" @@ -115,7 +118,7 @@ static void cs_parse_args(int argc, char *argv[]) { int o; - while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:")) != -1) { + while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:Q:")) != -1) { switch (o) { case 's': opt.silent_send = true; @@ -148,6 +151,10 @@ static void cs_parse_args(int argc, char *argv[]) opt.mark.ena = true; opt.mark.val = atoi(optarg); break; + case 'Q': + opt.priority.ena = true; + opt.priority.val = atoi(optarg); + break; case 'M': opt.sockopt.mark = atoi(optarg); break; @@ -252,6 +259,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_SOCKET, SO_MARK, &opt.mark); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_SOCKET, SO_PRIORITY, &opt.priority); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, diff --git a/tools/testing/selftests/net/cmsg_so_priority.sh b/tools/testing/selftests/net/cmsg_so_priority.sh new file mode 100755 index 000000000000..ee07d8653262 --- /dev/null +++ b/tools/testing/selftests/net/cmsg_so_priority.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +readonly KSFT_SKIP=4 + +IP4=192.0.2.1/24 +TGT4=192.0.2.2 +TGT4_RAW=192.0.2.3 +IP6=2001:db8::1/64 +TGT6=2001:db8::2 +TGT6_RAW=2001:db8::3 +PORT=1234 +TOTAL_TESTS=0 +FAILED_TESTS=0 + +if ! command -v jq &> /dev/null; then + echo "SKIP cmsg_so_priroity.sh test: jq is not installed." 
>&2 + exit "$KSFT_SKIP" +fi + +check_result() { + ((TOTAL_TESTS++)) + if [ "$1" -ne 0 ]; then + ((FAILED_TESTS++)) + fi +} + +cleanup() +{ + cleanup_ns $NS +} + +trap cleanup EXIT + +setup_ns NS + +create_filter() { + local handle=$1 + local vlan_prio=$2 + local ip_type=$3 + local proto=$4 + local dst_ip=$5 + local ip_proto + + if [[ "$proto" == "u" ]]; then + ip_proto="udp" + elif [[ "$ip_type" == "ipv4" && "$proto" == "i" ]]; then + ip_proto="icmp" + elif [[ "$ip_type" == "ipv6" && "$proto" == "i" ]]; then + ip_proto="icmpv6" + fi + + tc -n $NS filter add dev dummy1 \ + egress pref 1 handle "$handle" proto 802.1q \ + flower vlan_prio "$vlan_prio" vlan_ethtype "$ip_type" \ + dst_ip "$dst_ip" ${ip_proto:+ip_proto $ip_proto} \ + action pass +} + +ip -n $NS link set dev lo up +ip -n $NS link add name dummy1 up type dummy + +ip -n $NS link add link dummy1 name dummy1.10 up type vlan id 10 \ + egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 + +ip -n $NS address add $IP4 dev dummy1.10 +ip -n $NS address add $IP6 dev dummy1.10 nodad + +ip netns exec $NS sysctl -wq net.ipv4.ping_group_range='0 2147483647' + +ip -n $NS neigh add $TGT4 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT4_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 + +tc -n $NS qdisc add dev dummy1 clsact + +FILTER_COUNTER=10 + +for i in 4 6; do + for proto in u i r; do + echo "Test IPV$i, prot: $proto" + for priority in {0..7}; do + if [[ $i == 4 && $proto == "r" ]]; then + TGT=$TGT4_RAW + elif [[ $i == 6 && $proto == "r" ]]; then + TGT=$TGT6_RAW + elif [ $i == 4 ]; then + TGT=$TGT4 + else + TGT=$TGT6 + fi + + handle="${FILTER_COUNTER}${priority}" + + create_filter $handle $priority ipv$i $proto $TGT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + + if [[ $pkts == 0 ]]; then + check_result 0 + else + echo "prio $priority: expected 0, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -Q $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 1 ]]; then + check_result 0 + else + echo "prio $priority -Q: expected 1, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -P $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 2 ]]; then + check_result 0 + else + echo "prio $priority -P: expected 2, got $pkts" + check_result 1 + fi + done + FILTER_COUNTER=$((FILTER_COUNTER + 10)) + done +done + +if [ $FAILED_TESTS -ne 0 ]; then + echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed" + exit 1 +else + echo "OK - All $TOTAL_TESTS tests passed" + exit 0 +fi -- cgit v1.2.3 From e45469e594b255ef8d750ed5576698743450d2ac Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:57 +0100 Subject: sock: Introduce SO_RCVPRIORITY socket option Add new socket option, SO_RCVPRIORITY, to include SO_PRIORITY in the ancillary data returned by recvmsg(). 
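As a rough, self-contained illustration (not from this series; the helper is hypothetical, and only the value 79 for SO_RCVPRIORITY comes from the uapi header change below), a receiver would opt in via setsockopt() and then pick the priority out of the SOL_SOCKET control messages returned by recvmsg():

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#ifndef SO_RCVPRIORITY
#define SO_RCVPRIORITY 79
#endif

static void recv_and_dump_priority(int fd)
{
	char data[1500];
	char cbuf[CMSG_SPACE(sizeof(uint32_t))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	uint32_t prio;
	int one = 1;

	/* Opt in, mirroring how SO_RCVMARK enables mark reporting. */
	setsockopt(fd, SOL_SOCKET, SO_RCVPRIORITY, &one, sizeof(one));

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	/* Match on the level only; the exact cmsg_type is not shown in this hunk. */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		memcpy(&prio, CMSG_DATA(cmsg), sizeof(prio));
		printf("cmsg_type %d, priority %u\n", cmsg->cmsg_type, prio);
	}
}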
This is analogous to the existing support for SO_RCVMARK, as implemented in commit 6fd1d51cfa253 ("net: SO_RCVMARK socket option for SO_MARK with recvmsg()"). Reviewed-by: Willem de Bruijn Suggested-by: Ferenc Fejes Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-5-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- tools/include/uapi/asm-generic/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index 281df9139d2b..ffff554a5230 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -126,6 +126,8 @@ #define SCM_TS_OPT_ID 78 +#define SO_RCVPRIORITY 79 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From 59a42b0e78888e2d9a459b12e8d1eb09fb4a3c7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 2 Dec 2024 23:44:52 +0100 Subject: selftests/pidfd: add pidfs file handle selftests Add selftests for pidfs file handles. Link: https://lore.kernel.org/r/20241202-imstande-einsicht-d78753e1c632@brauner Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile | 3 +- tools/testing/selftests/pidfd/pidfd.h | 39 ++ .../selftests/pidfd/pidfd_file_handle_test.c | 503 +++++++++++++++++++++ tools/testing/selftests/pidfd/pidfd_setns_test.c | 47 +- tools/testing/selftests/pidfd/pidfd_wait.c | 47 +- 6 files changed, 567 insertions(+), 73 deletions(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_file_handle_test.c (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 973198a3ec3d..224260e1a4a2 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -6,3 +6,4 @@ pidfd_wait pidfd_fdinfo_test pidfd_getfd_test pidfd_setns_test +pidfd_file_handle_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index d731e3e76d5b..3c16d8e77684 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -2,7 +2,8 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ - pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test + pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ + pidfd_file_handle_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 88d6830ee004..28a471c88c51 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -17,6 +17,7 @@ #include #include "../kselftest.h" +#include "../clone3/clone3_selftests.h" #ifndef P_PIDFD #define P_PIDFD 3 @@ -68,6 +69,11 @@ #define PIDFD_SKIP 3 #define PIDFD_XFAIL 4 +static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options) +{ + return syscall(__NR_waitid, which, pid, info, options, NULL); +} + static inline int wait_for_pid(pid_t pid) { int status, ret; @@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags) return syscall(__NR_memfd_create, name, flags); } +static inline pid_t create_child(int *pidfd, unsigned flags) +{ + struct __clone_args args = { + .flags = CLONE_PIDFD | flags, + .exit_signal = SIGCHLD, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, 
sizeof(struct __clone_args)); +} + +static inline ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = read(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static inline ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + #endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c new file mode 100644 index 000000000000..439b9c6c0457 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c @@ -0,0 +1,503 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(file_handle) +{ + pid_t pid; + int pidfd; + + pid_t child_pid1; + int child_pidfd1; + + pid_t child_pid2; + int child_pidfd2; + + pid_t child_pid3; + int child_pidfd3; +}; + +FIXTURE_SETUP(file_handle) +{ + int ret; + int ipc_sockets[2]; + char c; + + self->pid = getpid(); + self->pidfd = sys_pidfd_open(self->pid, 0); + ASSERT_GE(self->pidfd, 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + close(ipc_sockets[1]); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + close(ipc_sockets[0]); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + close(ipc_sockets[1]); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + close(ipc_sockets[0]); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid3, 0); + + if (self->child_pid3 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + close(ipc_sockets[1]); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + close(ipc_sockets[0]); +} + +FIXTURE_TEARDOWN(file_handle) +{ + EXPECT_EQ(close(self->pidfd), 0); + + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0); + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + + EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0); + + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + + EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0); + + if (self->child_pidfd3 >= 0) { + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0); + EXPECT_EQ(0, 
close(self->child_pidfd3)); + EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0); + } +} + +/* + * Test that we can decode a pidfs file handle in the same pid + * namespace. + */ +TEST_F(file_handle, file_handle_same_pidns) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + free(fh); +} + +/* + * Test that we can decode a pidfs file handle from a child pid + * namespace. + */ +TEST_F(file_handle, file_handle_child_pidns) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + free(fh); +} + +/* + * Test that we fail to decode a pidfs file handle from an ancestor + * child pid namespace. + */ +TEST_F(file_handle, file_handle_foreign_pidns) +{ + int mnt_id; + struct file_handle *fh; + pid_t pid; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int pidfd = open_by_handle_at(self->pidfd, fh, 0); + if (pidfd >= 0) { + TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy"); + _exit(1); + } + _exit(0); + } + + ASSERT_EQ(wait_for_pid(pid), 0); + + free(fh); +} + +/* + * Test that we can decode a pidfs file handle of a process that has + * exited but not been reaped. 
+ */ +TEST_F(file_handle, pid_has_exited) +{ + int mnt_id, pidfd, child_pidfd3; + struct file_handle *fh; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + child_pidfd3 = self->child_pidfd3; + self->child_pidfd3 = -EBADF; + EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0); + EXPECT_EQ(close(child_pidfd3), 0); + EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0); +} + +/* + * Test that we fail to decode a pidfs file handle of a process that has + * already been reaped. + */ +TEST_F(file_handle, pid_has_been_reaped) +{ + int mnt_id, pidfd, child_pidfd3; + struct file_handle *fh; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + child_pidfd3 = self->child_pidfd3; + self->child_pidfd3 = -EBADF; + EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0); + EXPECT_EQ(close(child_pidfd3), 0); + EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_LT(pidfd, 0); +} + +/* + * Test valid flags to open a pidfd file handle. Note, that + * PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to + * O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL. + */ +TEST_F(file_handle, open_by_handle_at_valid_flags) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, + O_RDONLY | + O_WRONLY | + O_RDWR | + O_NONBLOCK | + O_NDELAY | + O_CLOEXEC | + O_EXCL); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); +} + +/* + * Test that invalid flags passed to open a pidfd file handle are + * rejected. 
+ */ +TEST_F(file_handle, open_by_handle_at_invalid_flags) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + static const struct invalid_pidfs_file_handle_flags { + int oflag; + const char *oflag_name; + } invalid_pidfs_file_handle_flags[] = { + { FASYNC, "FASYNC" }, + { O_CREAT, "O_CREAT" }, + { O_NOCTTY, "O_NOCTTY" }, + { O_CREAT, "O_CREAT" }, + { O_TRUNC, "O_TRUNC" }, + { O_APPEND, "O_APPEND" }, + { O_SYNC, "O_SYNC" }, + { O_DSYNC, "O_DSYNC" }, + { O_DIRECT, "O_DIRECT" }, + { O_DIRECTORY, "O_DIRECTORY" }, + { O_NOFOLLOW, "O_NOFOLLOW" }, + { O_NOATIME, "O_NOATIME" }, + { O_PATH, "O_PATH" }, + { O_TMPFILE, "O_TMPFILE" }, + /* + * O_LARGEFILE is added implicitly by + * open_by_handle_at() so pidfs simply masks it off. + */ + }; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + for (int i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) { + pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag); + ASSERT_LT(pidfd, 0) { + TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name); + } + } +} + +/* Test that lookup fails. */ +TEST_F(file_handle, lookup_must_fail) +{ + int mnt_id; + struct file_handle *fh; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0); + ASSERT_EQ(errno, ENOTDIR); + ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0); + ASSERT_EQ(errno, ENOTDIR); +} + +#ifndef AT_HANDLE_CONNECTABLE +#define AT_HANDLE_CONNECTABLE 0x002 +#endif + +/* + * Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles + * don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE + * is rejected because it is incompatible with AT_EMPTY_PATH which is + * required with pidfds as we don't support lookup. + */ +TEST_F(file_handle, invalid_name_to_handle_at_flags) +{ + int mnt_id; + struct file_handle *fh; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0); +} + +#ifndef AT_HANDLE_FID +#define AT_HANDLE_FID 0x200 +#endif + +/* + * Test that a request with AT_HANDLE_FID always leads to decodable file + * handle as pidfs always provides export operations. 
+ */ +TEST_F(file_handle, valid_name_to_handle_at_flags) +{ + int mnt_id, pidfd; + struct file_handle *fh; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0); + + ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 7c2a4349170a..222f8131283b 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -19,7 +19,6 @@ #include #include "pidfd.h" -#include "../clone3/clone3_selftests.h" #include "../kselftest_harness.h" #ifndef PIDFS_IOCTL_MAGIC @@ -118,22 +117,6 @@ FIXTURE(current_nsset) int child_pidfd_derived_nsfds2[PIDFD_NS_MAX]; }; -static int sys_waitid(int which, pid_t pid, int options) -{ - return syscall(__NR_waitid, which, pid, NULL, options, NULL); -} - -pid_t create_child(int *pidfd, unsigned flags) -{ - struct __clone_args args = { - .flags = CLONE_PIDFD | flags, - .exit_signal = SIGCHLD, - .pidfd = ptr_to_u64(pidfd), - }; - - return sys_clone3(&args, sizeof(struct clone_args)); -} - static bool switch_timens(void) { int fd, ret; @@ -150,28 +133,6 @@ static bool switch_timens(void) return ret == 0; } -static ssize_t read_nointr(int fd, void *buf, size_t count) -{ - ssize_t ret; - - do { - ret = read(fd, buf, count); - } while (ret < 0 && errno == EINTR); - - return ret; -} - -static ssize_t write_nointr(int fd, const void *buf, size_t count) -{ - ssize_t ret; - - do { - ret = write(fd, buf, count); - } while (ret < 0 && errno == EINTR); - - return ret; -} - FIXTURE_SETUP(current_nsset) { int i, proc_fd, ret; @@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset) _exit(EXIT_SUCCESS); } - ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0); self->pidfd = sys_pidfd_open(self->pid, 0); EXPECT_GE(self->pidfd, 0) { @@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset) EXPECT_EQ(0, close(self->child_pidfd1)); if (self->child_pidfd2 >= 0) EXPECT_EQ(0, close(self->child_pidfd2)); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0); } static int preserve_ns(const int pid, const char *ns) diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c index 0dcb8365ddc3..1e2d49751cde 100644 --- a/tools/testing/selftests/pidfd/pidfd_wait.c +++ b/tools/testing/selftests/pidfd/pidfd_wait.c @@ -26,22 +26,11 @@ #define SKIP(s, ...) 
XFAIL(s, ##__VA_ARGS__) #endif -static pid_t sys_clone3(struct clone_args *args) -{ - return syscall(__NR_clone3, args, sizeof(struct clone_args)); -} - -static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options, - struct rusage *ru) -{ - return syscall(__NR_waitid, which, pid, info, options, ru); -} - TEST(wait_simple) { int pidfd = -1; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .pidfd = ptr_to_u64(&pidfd), .flags = CLONE_PIDFD | CLONE_PARENT_SETTID, @@ -55,7 +44,7 @@ TEST(wait_simple) pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC); ASSERT_GE(pidfd, 0); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_NE(pid, 0); EXPECT_EQ(close(pidfd), 0); pidfd = -1; @@ -63,18 +52,18 @@ TEST(wait_simple) pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC); ASSERT_GE(pidfd, 0); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_NE(pid, 0); EXPECT_EQ(close(pidfd), 0); pidfd = -1; - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) exit(EXIT_SUCCESS); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_GE(pid, 0); ASSERT_EQ(WIFEXITED(info.si_status), true); ASSERT_EQ(WEXITSTATUS(info.si_status), 0); @@ -89,7 +78,7 @@ TEST(wait_states) { int pidfd = -1; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .pidfd = ptr_to_u64(&pidfd), .flags = CLONE_PIDFD | CLONE_PARENT_SETTID, @@ -102,7 +91,7 @@ TEST(wait_states) }; ASSERT_EQ(pipe(pfd), 0); - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) { @@ -117,28 +106,28 @@ TEST(wait_states) } close(pfd[0]); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0); ASSERT_EQ(write(pfd[1], "C", 1), 1); close(pfd[1]); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_CONTINUED); ASSERT_EQ(info.si_pid, parent_tid); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_KILLED); ASSERT_EQ(info.si_pid, parent_tid); @@ -151,7 +140,7 @@ TEST(wait_nonblock) int pidfd; unsigned int flags = 0; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .flags = CLONE_PARENT_SETTID, .exit_signal = SIGCHLD, @@ -173,12 +162,12 @@ TEST(wait_nonblock) SKIP(return, "Skipping PIDFD_NONBLOCK test"); } - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_LT(ret, 0); ASSERT_EQ(errno, ECHILD); 
EXPECT_EQ(close(pidfd), 0); - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) { @@ -201,7 +190,7 @@ TEST(wait_nonblock) * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when * child processes exist but none have exited. */ - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_LT(ret, 0); ASSERT_EQ(errno, EAGAIN); @@ -210,19 +199,19 @@ TEST(wait_nonblock) * WNOHANG raised explicitly when child processes exist but none have * exited. */ - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG); ASSERT_EQ(ret, 0); ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_EXITED); ASSERT_EQ(info.si_pid, parent_tid); -- cgit v1.2.3 From 212fbabe1dfecdda35bf5aaa900f745a3bab5ac4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 16 Dec 2024 19:28:24 +0000 Subject: KVM: arm64: Fix set_id_regs selftest for ASIDBITS becoming unwritable In commit 03c7527e97f7 ("KVM: arm64: Do not allow ID_AA64MMFR0_EL1.ASIDbits to be overridden") we made that bitfield in the ID registers unwritable however the change neglected to make the corresponding update to set_id_regs resulting in it failing: ok 56 ID_AA64MMFR0_EL1_BIGEND ==== Test Assertion Failure ==== aarch64/set_id_regs.c:434: masks[idx] & ftr_bits[j].mask == ftr_bits[j].mask pid=5566 tid=5566 errno=22 - Invalid argument 1 0x00000000004034a7: test_vm_ftr_id_regs at set_id_regs.c:434 2 0x0000000000401b53: main at set_id_regs.c:684 3 0x0000ffff8e6b7543: ?? ??:0 4 0x0000ffff8e6b7617: ?? ??:0 5 0x0000000000401e6f: _start at ??:? not ok 8 selftests: kvm: set_id_regs # exit=254 Remove ID_AA64MMFR1_EL1.ASIDBITS from the set of bitfields we test for writeability. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241216-kvm-arm64-fix-set-id-asidbits-v1-1-8b105b888fc3@kernel.org Acked-by: Marc Zyngier Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index a79b7f18452d..3a97c160b5fe 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -152,7 +152,6 @@ static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGENDEL0, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, SNSMEM, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGEND, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ASIDBITS, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, PARANGE, 0), REG_FTR_END, }; -- cgit v1.2.3 From 498d5b14db8c9118be139f668720c67bea2dc344 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 11 Dec 2024 23:01:43 -0800 Subject: riscv: selftests: Fix warnings pointer masking test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling the pointer masking tests with -Wall this warning is present: pointer_masking.c: In function ‘test_tagged_addr_abi_sysctl’: pointer_masking.c:203:9: warning: ignoring return value of ‘pwrite’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 203 | pwrite(fd, &value, 1, 0); | ^~~~~~~~~~~~~~~~~~~~~~~~ pointer_masking.c:208:9: warning: ignoring return value of ‘pwrite’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 208 | pwrite(fd, &value, 1, 0); I came across this on riscv64-linux-gnu-gcc (Ubuntu 11.4.0-1ubuntu1~22.04). Fix this by checking that the number of bytes written equal the expected number of bytes written. 
Fixes: 7470b5afd150 ("riscv: selftests: Add a pointer masking test") Signed-off-by: Charlie Jenkins Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241211-fix_warnings_pointer_masking_tests-v6-1-c7ae708fbd2f@rivosinc.com Signed-off-by: Palmer Dabbelt --- .../testing/selftests/riscv/abi/pointer_masking.c | 28 +++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/riscv/abi/pointer_masking.c b/tools/testing/selftests/riscv/abi/pointer_masking.c index dee41b7ee3e3..059d2e87eb1f 100644 --- a/tools/testing/selftests/riscv/abi/pointer_masking.c +++ b/tools/testing/selftests/riscv/abi/pointer_masking.c @@ -185,8 +185,20 @@ static void test_fork_exec(void) } } +static bool pwrite_wrapper(int fd, void *buf, size_t count, const char *msg) +{ + int ret = pwrite(fd, buf, count, 0); + + if (ret != count) { + ksft_perror(msg); + return false; + } + return true; +} + static void test_tagged_addr_abi_sysctl(void) { + char *err_pwrite_msg = "failed to write to /proc/sys/abi/tagged_addr_disabled\n"; char value; int fd; @@ -200,14 +212,18 @@ static void test_tagged_addr_abi_sysctl(void) } value = '1'; - pwrite(fd, &value, 1, 0); - ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == -EINVAL, - "sysctl disabled\n"); + if (!pwrite_wrapper(fd, &value, 1, "write '1'")) + ksft_test_result_fail(err_pwrite_msg); + else + ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == -EINVAL, + "sysctl disabled\n"); value = '0'; - pwrite(fd, &value, 1, 0); - ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == 0, - "sysctl enabled\n"); + if (!pwrite_wrapper(fd, &value, 1, "write '0'")) + ksft_test_result_fail(err_pwrite_msg); + else + ksft_test_result(set_tagged_addr_ctrl(min_pmlen, true) == 0, + "sysctl enabled\n"); set_tagged_addr_ctrl(0, false); -- cgit v1.2.3 From a7c205120d339b6ad2557fe3f33fdf20394f1a0f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 17 Dec 2024 18:11:13 +0000 Subject: veristat: Fix top source line stat collection Fix comparator implementation to return most popular source code lines instead of least. Introduce min/max macro for building veristat outside of Linux repository. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241217181113.364651-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/veristat.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 162fe27d06f8..9d17b4dfc170 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -26,6 +26,14 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#ifndef max +#define max(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + enum stat_id { VERDICT, DURATION, @@ -904,7 +912,7 @@ static int line_cnt_cmp(const void *a, const void *b) const struct line_cnt *b_cnt = (const struct line_cnt *)b; if (a_cnt->cnt != b_cnt->cnt) - return a_cnt->cnt < b_cnt->cnt ? -1 : 1; + return a_cnt->cnt > b_cnt->cnt ? -1 : 1; return strcmp(a_cnt->line, b_cnt->line); } -- cgit v1.2.3 From 026ac4dda8f666f737b375731e30ef8f5698b215 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 16 Dec 2024 21:32:55 +0530 Subject: selftest/powerpc/ptrace/core-pkey: Remove duplicate macros ./powerpc/ptrace/Makefile includes flags.mk. 
In flags.mk, -I$(selfdir)/powerpc/include is always included as part of CFLAGS. So it will pick up the "pkeys.h" defined in powerpc/include. core-pkey.c test has couple of macros defined which are part of "pkeys.h" header file. Remove those duplicates and include "pkeys.h" Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241216160257.87252-1-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/ptrace/core-pkey.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index f6da4cb30cd6..31c9bf6d95db 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -16,14 +16,7 @@ #include #include "ptrace.h" #include "child.h" - -#ifndef __NR_pkey_alloc -#define __NR_pkey_alloc 384 -#endif - -#ifndef __NR_pkey_free -#define __NR_pkey_free 385 -#endif +#include "pkeys.h" #ifndef NT_PPC_PKEY #define NT_PPC_PKEY 0x110 @@ -61,16 +54,6 @@ struct shared_info { time_t core_time; }; -static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights) -{ - return syscall(__NR_pkey_alloc, flags, init_access_rights); -} - -static int sys_pkey_free(int pkey) -{ - return syscall(__NR_pkey_free, pkey); -} - static int increase_core_file_limit(void) { struct rlimit rlim; -- cgit v1.2.3 From b0e1b95b1597ad3d87ff91d52f6b67cc9423c31e Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 16 Dec 2024 21:32:56 +0530 Subject: selftest/powerpc/ptrace/ptrace-pkey: Remove duplicate macros ./powerpc/ptrace/Makefile includes flags.mk. In flags.mk, -I$(selfdir)/powerpc/include is always included as part of CFLAGS. So it will pick up the "pkeys.h" defined in powerpc/include. ptrace-pkey.c test has macros defined which are part of "pkeys.h" header file. Remove those duplicates and include "pkeys.h" Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241216160257.87252-2-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c index d89474377f11..6893ed096457 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c @@ -7,14 +7,7 @@ */ #include "ptrace.h" #include "child.h" - -#ifndef __NR_pkey_alloc -#define __NR_pkey_alloc 384 -#endif - -#ifndef __NR_pkey_free -#define __NR_pkey_free 385 -#endif +#include "pkeys.h" #ifndef NT_PPC_PKEY #define NT_PPC_PKEY 0x110 @@ -61,11 +54,6 @@ struct shared_info { unsigned long invalid_uamor; }; -static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights) -{ - return syscall(__NR_pkey_alloc, flags, init_access_rights); -} - static int child(struct shared_info *info) { unsigned long reg; -- cgit v1.2.3 From 65f5038352e8f635fb827f7482f1d08fae4d16bf Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 16 Dec 2024 21:32:57 +0530 Subject: selftest/powerpc/ptrace: Cleanup duplicate macro definitions Both core-pkey.c and ptrace-pkey.c tests have similar macro definitions, move them to "pkeys.h" and remove the macro definitions from the C file. 
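For reference (not in the patch itself), the macros now shared via pkeys.h are typically combined to address the two AMR bits that belong to a given key. A standalone sketch, with the definitions copied from the powerpc/include/pkeys.h hunk below (u64 spelled as uint64_t) and the helper name made up for illustration:

#include <stdint.h>

#define PKEY_BITS_PER_PKEY	2
#define PKEY_BITS_MASK		((1UL << PKEY_BITS_PER_PKEY) - 1)
#define AMR_BITS_PER_PKEY	2
#define PKEY_REG_BITS		(sizeof(uint64_t) * 8)
#define pkeyshift(pkey)		(PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))

/* Hypothetical helper: mask covering the AMR bits owned by one pkey. */
static inline uint64_t pkey_amr_mask(int pkey)
{
	return PKEY_BITS_MASK << pkeyshift(pkey);
}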
Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241216160257.87252-3-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/include/pkeys.h | 8 ++++++++ tools/testing/selftests/powerpc/ptrace/core-pkey.c | 12 ------------ tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 12 ------------ 3 files changed, 8 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index 51729d9a7111..3a0129467de6 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -35,10 +35,18 @@ #define __NR_pkey_alloc 384 #define __NR_pkey_free 385 +#ifndef NT_PPC_PKEY +#define NT_PPC_PKEY 0x110 +#endif + #define PKEY_BITS_PER_PKEY 2 #define NR_PKEYS 32 #define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1) +#define AMR_BITS_PER_PKEY 2 +#define PKEY_REG_BITS (sizeof(u64) * 8) +#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY)) + inline unsigned long pkeyreg_get(void) { return mfspr(SPRN_AMR); diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index 31c9bf6d95db..f061434af452 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -18,18 +18,6 @@ #include "child.h" #include "pkeys.h" -#ifndef NT_PPC_PKEY -#define NT_PPC_PKEY 0x110 -#endif - -#ifndef PKEY_DISABLE_EXECUTE -#define PKEY_DISABLE_EXECUTE 0x4 -#endif - -#define AMR_BITS_PER_PKEY 2 -#define PKEY_REG_BITS (sizeof(u64) * 8) -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY)) - #define CORE_FILE_LIMIT (5 * 1024 * 1024) /* 5 MB should be enough */ static const char core_pattern_file[] = "/proc/sys/kernel/core_pattern"; diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c index 6893ed096457..fc633014424f 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c @@ -9,18 +9,6 @@ #include "child.h" #include "pkeys.h" -#ifndef NT_PPC_PKEY -#define NT_PPC_PKEY 0x110 -#endif - -#ifndef PKEY_DISABLE_EXECUTE -#define PKEY_DISABLE_EXECUTE 0x4 -#endif - -#define AMR_BITS_PER_PKEY 2 -#define PKEY_REG_BITS (sizeof(u64) * 8) -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY)) - static const char user_read[] = "[User Read (Running)]"; static const char user_write[] = "[User Write (Running)]"; static const char ptrace_read_running[] = "[Ptrace Read (Running)]"; -- cgit v1.2.3 From 88395c071f08d9ea2314045230206cc5a3f82ef0 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:51:58 +0000 Subject: selftests/net: packetdrill: import tcp/ecn, tcp/close, tcp/sack, tcp/tcp_info Same as initial tests, import verbatim from github.com/google/packetdrill, aside from: - update `source ./defaults.sh` path to adjust for flat dir - add SPDX headers - remove author statements if any - drop blank lines at EOF Same test process as previous tests. Both with and without debug mode. Recording the steps once: make mrproper vng --build \ --config tools/testing/selftests/net/packetdrill/config \ --config kernel/configs/debug.config vng -v --run . 
--user root --cpus 4 -- \ make -C tools/testing/selftests TARGETS=net/packetdrill run_tests Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-2-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- ...tcp_close_close-local-close-then-remote-fin.pkt | 23 ++++++++ .../packetdrill/tcp_close_close-on-syn-sent.pkt | 21 +++++++ .../tcp_close_close-remote-fin-then-close.pkt | 36 ++++++++++++ .../net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt | 21 +++++++ .../tcp_sack_sack-route-refresh-ip-tos.pkt | 37 ++++++++++++ ...tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt | 64 +++++++++++++++++++++ .../tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt | 66 ++++++++++++++++++++++ .../tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt | 62 ++++++++++++++++++++ .../tcp_tcp_info_tcp-info-last_data_recv.pkt | 20 +++++++ .../tcp_tcp_info_tcp-info-rwnd-limited.pkt | 54 ++++++++++++++++++ .../tcp_tcp_info_tcp-info-sndbuf-limited.pkt | 38 +++++++++++++ 11 files changed, 442 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt new file mode 100644 index 000000000000..8514d6bdbb6d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test basic connection teardown where local process closes first: +// the local process calls close() first, so we send a FIN, and receive an ACK. +// Then we receive a FIN and ACK it. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +.01...0.011 connect(3, ..., ...) = 0 + +0 > S 0:0(0) <...> + +0 < S. 0:0(0) ack 1 win 32768 + +0 > . 1:1(0) ack 1 + + +0 write(3, ..., 1000) = 1000 + +0 > P. 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1001 win 257 + + +0 close(3) = 0 + +0 > F. 1001:1001(0) ack 1 + +0 < . 1:1(0) ack 1002 win 257 + + +0 < F. 1:1(0) ack 1002 win 257 + +0 > . 
1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt new file mode 100644 index 000000000000..04103134bd99 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test to make sure no RST is being sent when close() +// is called on a socket with SYN_SENT state. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <...> + +// Application decideds to close the socket in SYN_SENT state +// Make sure no RST is sent after close(). + +0 close(3) = 0 + +// Receive syn-ack to trigger the send side packet examination: +// If a RESET were sent right after close(), it would have failed with +// a mismatched timestamp. + +.1 < S. 0:0(0) ack 1 win 32000 + +0 > R 1:1(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt new file mode 100644 index 000000000000..5f3a2914213a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify behavior for the sequence: remote side sends FIN, then we close(). +// Since the remote side (client) closes first, we test our LAST_ACK code path. + +`./defaults.sh` + +// Initialize a server socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + +// Client closes first. + +.01 < F. 1:1(0) ack 1 win 257 + +0 > . 1:1(0) ack 2 + +// App notices that client closed. + +0 read(4, ..., 1000) = 0 + +// Then we close. + +.01 close(4) = 0 + +0 > F. 1:1(0) ack 2 + +// Client ACKs our FIN. + +.01 < . 2:2(0) ack 2 win 257 + +// Verify that we send RST in response to any incoming segments +// (because the kernel no longer has any record of this socket). + +.01 < . 2:2(0) ack 2 win 257 + +0 > R 2:2(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt new file mode 100644 index 000000000000..643baf3267cf --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test ECN: verify that Linux TCP ECN sending code uses ECT0 (not ECT1). +// +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=1 # fully enabled +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +// ECN handshake: send EW flags in SYN packet, E flag in SYN-ACK response ++.002 ... 0.004 connect(4, ..., ...) = 0 + + +0 > SEW 0:0(0) ++.002 < SE. 0:0(0) ack 1 win 32767 + +0 > . 1:1(0) ack 1 + +// Write 1 MSS. ++.002 write(4, ..., 1000) = 1000 +// Send 1 MSS with ect0. + +0 > [ect0] P. 
1:1001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt new file mode 100644 index 000000000000..310ef31518da --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that setsockopt calls that force a route refresh do not +// cause problems matching SACKs with packets in the write queue. +// This variant tests IP_TOS. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_IP, IP_MTU_DISCOVER, [IP_PMTUDISC_DONT], 1) = 0 + +0...0.010 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 1:5841(5840) ack 1 + +.01 < . 1:1(0) ack 5841 win 65535 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 5841:11681(5840) ack 1 + +.01 < . 1:1(0) ack 11681 win 65535 + + +.01 write(3, ..., 14600) = 14600 + +0 > P. 11681:26281(14600) ack 1 + +// Try the socket option that we know can force a route refresh. + +0 setsockopt(3, SOL_IP, IP_TOS, [4], 1) = 0 +// Then revert to avoid routing/mangling/etc implications of that setting. + +0 setsockopt(3, SOL_IP, IP_TOS, [0], 1) = 0 + +// Verify that we do not retransmit the SACKed segments. + +.01 < . 1:1(0) ack 13141 win 65535 + +0 > . 13141:16061(2920) ack 1 + +0 > P. 17521:20441(2920) ack 1 + +.01 < . 1:1(0) ack 26281 win 65535 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt new file mode 100644 index 000000000000..f185e1ac57ea --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests non-FACK SACK with SACKs coming in the order +// 2 6 8 3 9, to test what happens when we get a new SACKed range +// (for packet 3) that is on the right of an existing SACKed range +// (for packet 2). + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + + +.1 < . 1:1(0) ack 1 win 257 ++.001 < . 1:1(0) ack 1 win 257 ++.001 < . 1:1(0) ack 1 win 257 + +// 3 SACKed packets, so we enter Fast Recovery. + +0 > . 1:1001(1000) ack 1 + +0 %{ assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state }% + +0 %{ assert tcpi_lost == 6, tcpi_lost }% + +// SACK for 3001:4001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. ++.007 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +0 %{ assert tcpi_reordering == 6, tcpi_reordering }% // 8001:9001 -> 3001:4001 is 6 + +// SACK for 9001:10001. + +.01 < . 1:1(0) ack 1 win 257 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// ACK for 1:1001 as packets from t=0.303 arrive. ++.083 < . 
1:1(0) ack 1001 win 257 + +0 %{ assert tcpi_lost == 4,tcpi_lost }% + +// ACK for 1:4001 as packets from t=0.310 arrive. ++.017 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_lost == 3,tcpi_lost }% + +// ACK for 1:7001 as packets from t=0.320 arrive. + +.01 < . 1:1(0) ack 7001 win 257 + +// ACK for all data as packets from t=0.403 arrive. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt new file mode 100644 index 000000000000..0093b4973934 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 3, and then a SACK for 4. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay the fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 +// RACK reordering timer ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7, tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 3001:4001. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost # since 3001:4001 is no longer lost +assert tcpi_reordering == 5, tcpi_reordering # 7001:8001 -> 3001:4001 +}% + +// SACK for 4001:5001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. +// It uses the RFC3517 algorithm to mark 1:3001 lost +// because >=3 higher-sequence packets are SACKed. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 2001:3001(1000) ack 1 + +0 %{ +assert tcpi_lost == 5,tcpi_lost # SACK/RFC3517 thinks 1:3001 are lost +}% + +// SACK for 8001:9001. ++.002 < . 1:1(0) ack 1 win 257 + +// SACK for 9001:10001. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 5001:6001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt new file mode 100644 index 000000000000..980a832dc81c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. 
+// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 5, and then a SACK for 6. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay a fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7,tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 5001:6001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost +assert tcpi_reordering == 3, tcpi_reordering # 7001:8001 -> 5001:6001 is 3 +}% + +// SACK for 6001:7001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 2001:3001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// SACK for 8001:9001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 3001:4001(1000) ack 1 + +// SACK for 9001:10001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 4001:5001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt new file mode 100644 index 000000000000..d7fdb43a8e89 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tcpi_last_data_recv for active session +`./defaults.sh` + +// Create a socket and set it to non-blocking. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + ++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) ++0 > S 0:0(0) ++.030 < S. 0:0(0) ack 1 win 10000 ++0 > . 1:1(0) ack 1 + ++1 %{ assert 990 <= tcpi_last_data_recv <= 1010, tcpi_last_data_recv }% + ++0 < . 1:1001(1000) ack 1 win 300 ++0 > . 1:1(0) ack 1001 + ++0 %{ assert tcpi_last_data_recv <= 10, tcpi_last_data_recv }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt new file mode 100644 index 000000000000..a9bcd46f6cb6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test rwnd limited time in tcp_info for client side. + +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +// Server advertises 0 receive window. + +.01 < S. 0:0(0) ack 1 win 0 + + +0 > . 
1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +// Make sure that initial rwnd limited time is 0. + +0 %{ assert tcpi_rwnd_limited == 0, tcpi_rwnd_limited }% + +// Receive window limited time starts here. + +0 write(3, ..., 1000) = 1000 + +// Check that rwnd limited time in tcp_info is around 0.1s. + +.1 %{ assert 98000 <= tcpi_rwnd_limited <= 110000, tcpi_rwnd_limited }% + +// Server opens the receive window. + +.1 < . 1:1(0) ack 1 win 2000 + +// Check that rwnd limited time in tcp_info is around 0.2s. + +0 %{ assert 198000 <= tcpi_rwnd_limited <= 210000, tcpi_rwnd_limited }% + + +0 > P. 1:1001(1000) ack 1 + +// Server advertises a very small receive window. + +.03 < . 1:1(0) ack 1001 win 10 + +// Receive window limited time starts again. + +0 write(3, ..., 1000) = 1000 + +// Server opens the receive window again. + +.1 < . 1:1(0) ack 1001 win 2000 +// Check that rwnd limited time in tcp_info is around 0.3s +// and busy time is 0.3 + 0.03 (server opened small window temporarily). + +0 %{ assert 298000 <= tcpi_rwnd_limited <= 310000, tcpi_rwnd_limited;\ + assert 328000 <= tcpi_busy_time <= 340000, tcpi_busy_time;\ +}% + + +0 > P. 1001:2001(1000) ack 1 + +.02 < . 1:1(0) ack 2001 win 2000 + +0 %{ assert 348000 <= tcpi_busy_time <= 360000, tcpi_busy_time }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt new file mode 100644 index 000000000000..f0de2acd0f8e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test send-buffer-limited time in tcp_info for client side. +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 10000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [10000], 4) = 0 + +0 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [20000], [4]) = 0 + + +.09...0.14 write(3, ..., 150000) = 150000 + + +.01 < . 1:1(0) ack 10001 win 10000 + + +.01 < . 1:1(0) ack 30001 win 10000 + +// cwnd goes from 40(60KB) to 80(120KB), and that we hit the tiny sndbuf limit 10KB + +.01 < . 1:1(0) ack 70001 win 10000 + + +.02 < . 1:1(0) ack 95001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited; \ + assert 49000 <= tcpi_busy_time <= 52000, tcpi_busy_time; \ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% + +// This ack frees up enough buffer so we are no longer +// buffer limited (socket flag SOCK_NOSPACE is cleared) + +.02 < . 
1:1(0) ack 150001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited;\ + assert 69000 <= tcpi_busy_time <= 73000, tcpi_busy_time;\ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% -- cgit v1.2.3 From eab35989cc37e168550b7bfa690905ea2d1ae603 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:51:59 +0000 Subject: selftests/net: packetdrill: import tcp/fast_recovery, tcp/nagle, tcp/timestamping Use the standard import and testing method, as described in the import of tcp/ecn , tcp/close , tcp/sack , tcp/tcp_info. Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-3-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt | 72 ++++++++++ ...p_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt | 50 +++++++ .../tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt | 43 ++++++ ...ast_recovery_prr-ss-ack-below-snd_una-cubic.pkt | 41 ++++++ .../net/packetdrill/tcp_nagle_https_client.pkt | 40 ++++++ .../net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt | 66 ++++++++++ .../packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt | 43 ++++++ .../tcp_timestamping_client-only-last-byte.pkt | 92 +++++++++++++ .../net/packetdrill/tcp_timestamping_partial.pkt | 91 +++++++++++++ .../net/packetdrill/tcp_timestamping_server.pkt | 145 +++++++++++++++++++++ 10 files changed, 683 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt new file mode 100644 index 000000000000..0d3c8077e830 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. +// In this variant we test a simple case where in-flight == ssthresh +// all the way through recovery, so during fast recovery we send one segment +// for each segment SACKed/ACKed. + +// Set up config. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 +// RTT 100ms + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 10 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1:1001. + +.11 < . 1:1(0) ack 1 win 320 + +.01 < . 
1:1(0) ack 1 win 320 + +.01 < . 1:1(0) ack 1 win 320 +// Enter fast recovery. + +0 > . 1:1001(1000) ack 1 + +.01 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd +assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh +}% + +// Write some more, which we will send 1 MSS at a time, +// as in-flight segments are SACKed or ACKed. + +.01 write(4, ..., 7000) = 7000 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 12001:13001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 13001:14001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 14001:15001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 15001:16001(1000) ack 1 + + +.02 < . 1:1(0) ack 10001 win 320 + +0 > P. 16001:17001(1000) ack 1 +// Leave fast recovery. + +.01 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd +assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh +}% + + +.03 < . 1:1(0) ack 12001 win 320 + +.02 < . 1:1(0) ack 14001 win 320 + +.02 < . 1:1(0) ack 16001 win 320 + +.02 < . 1:1(0) ack 17001 win 320 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt new file mode 100644 index 000000000000..7842a10b6967 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. The sender sends 20 packets. Packet +// 1 to 4, and 11 to 16 are dropped. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write 20 data segments. + +0 write(4, ..., 20000) = 20000 + +0 > P. 1:10001(10000) ack 1 + +// Receive first DUPACK, entering PRR part + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 1001:2001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 2001:3001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 3001:4001(1000) ack 1 +// Enter PRR CRB ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 12001:13001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 13001:14001(1000) ack 1 +// Enter PRR slow start + +.01 < . 1:1(0) ack 1001 win 320 + +0 > P. 14001:16001(2000) ack 1 ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 1001:2001(1000) ack 1 + +0 > . 16001:17001(1000) ack 1 +// inflight reaches ssthresh, goes into packet conservation mode ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 17001:18001(1000) ack 1 ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 18001:19001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt new file mode 100644 index 000000000000..b66d7644c3b6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. 
The sender sends 20 packets. Packet +// 1 to 4 are lost. The sender writes another 10 packets. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 20 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1,2,3,4 + +.01 < . 1:1(0) ack 1 win 320 ++.002 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1001:2001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 2001:3001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 3001:4001(1000) ack 1 + +// Receiver ACKs all data. + +.01 < . 1:1(0) ack 1001 win 320 + +0 < . 1:1(0) ack 2001 win 320 + +0 < . 1:1(0) ack 3001 win 320 + +0 < . 1:1(0) ack 10001 win 320 + +// Writes another 10 packets, which the ssthresh*mss amount +// should be sent right away + +.01 write(4, ..., 10000) = 10000 + +0 > . 10001:17001(7000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt new file mode 100644 index 000000000000..8e87bfecabb5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. +// In this variant we verify that the sender uses SACK info on an ACK +// below snd_una. + +// Set up config. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 +// RTT 10ms + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 10 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1:1001,4001:5001,7001:8001. + +.01 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 + ++.012 < . 1:1(0) ack 4001 win 320 + +0 > . 4001:7001(3000) ack 1 + + +0 write(4, ..., 10000) = 10000 + +// The following ACK was reordered - delayed so that it arrives with +// an ACK field below snd_una. Here we check that the newly-SACKed +// 2MSS at 5001:7001 cause us to send out 2 more MSS. ++.002 < . 1:1(0) ack 3001 win 320 + +0 > . 7001:8001(1000) ack 1 + +0 > . 10001:11001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt new file mode 100644 index 000000000000..7adae7a9ef4a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +// This is a test inspired by an Android client app using SSL. This +// test verifies using TCP_NODELAY would save application latency +// (Perhaps even better with TCP_NAGLE). +// +`./defaults.sh +ethtool -K tun0 tso off gso off +./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + + +0 connect(4, ..., ...) 
= -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.1 < S. 0:0(0) ack 1 win 5792 + +0 > . 1:1(0) ack 1 + +// SSL handshake (resumed session) + +0 write(4, ..., 517) = 517 + +0 > P. 1:518(517) ack 1 + +.1 < . 1:1(0) ack 518 win 229 + + +0 < P. 1:144(143) ack 1 win 229 + +0 > . 518:518(0) ack 144 + +0 read(4, ..., 1000) = 143 + +// Application POST header (51B) and body (2002B) + +0 write(4, ..., 51) = 51 + +0 > P. 518:569(51) ack 144 + +.03 write(4, ..., 2002) = 2002 + +0 > . 569:1543(974) ack 144 + +0 > P. 1543:2517(974) ack 144 +// Without disabling Nagle, this packet will not happen until the remote ACK. + +0 > P. 2517:2571(54) ack 144 + + +.1 < . 1:1(0) ack 2571 win 229 + +// Reset sysctls +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt new file mode 100644 index 000000000000..fa9c01813996 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test the MSG_MORE flag will correctly corks the tiny writes +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 +// Disable Nagle by default on this socket. + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +// Test the basic case: MSG_MORE overwrites TCP_NODELAY and enables Nagle. + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 40}], msg_flags=0}, MSG_MORE) = 40 + +.21~+.215 > P. 1:41(40) ack 1 + +.01 < . 1:1(0) ack 41 win 257 + +// Test unsetting MSG_MORE releases the packet + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 100}], msg_flags=0}, MSG_MORE) = 100 ++.005 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 160}], msg_flags=0}, MSG_MORE) = 160 + +.01 sendmsg(4, {msg_name(...)=..., + msg_iov(3)=[{..., 100}, {..., 200}, {..., 195}], + msg_flags=0}, MSG_MORE) = 495 ++.008 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 5}], msg_flags=0}, 0) = 5 + +0 > P. 41:801(760) ack 1 + +.02 < . 1:1(0) ack 801 win 257 + + +// Test >MSS write will unleash MSS packets but hold on the remaining data. + +.1 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 3100}], msg_flags=0}, MSG_MORE) = 3100 + +0 > . 801:3801(3000) ack 1 ++.003 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 50}], msg_flags=0}, MSG_MORE) = 50 + + +.01 < . 1:1(0) ack 2801 win 257 +// Err... we relase the remaining right after the ACK? note that PUSH is reset + +0 > . 3801:3951(150) ack 1 + +// Test we'll hold on the subsequent writes when inflight (3801:3951) > 0 ++.001 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 1}], msg_flags=0}, MSG_MORE) = 1 ++.002 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 2}], msg_flags=0}, MSG_MORE) = 2 ++.003 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 3}], msg_flags=0}, MSG_MORE) = 3 ++.004 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 4}], msg_flags=0}, MSG_MORE) = 4 + +.02 < . 1:1(0) ack 3951 win 257 + +0 > . 3951:3961(10) ack 1 + +.02 < . 1:1(0) ack 3961 win 257 + + +// Test the case a MSG_MORE send followed by a write flushes the data + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 20}], msg_flags=0}, MSG_MORE) = 20 + +.05 write(4, ..., 20) = 20 + +0 > P. 
3961:4001(40) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt new file mode 100644 index 000000000000..0ddec5f7dc1a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP_CORK and TCP_NODELAY sockopt behavior +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 +// Set TCP_CORK sockopt to hold small packets + +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0 + + +0 write(4, ..., 40) = 40 + +.05 write(4, ..., 40) = 40 + +// Unset TCP_CORK should push pending bytes out + +.01 setsockopt(4, SOL_TCP, TCP_CORK, [0], 4) = 0 + +0 > P. 1:81(80) ack 1 + +.01 < . 1:1(0) ack 81 win 257 + +// Set TCP_CORK sockopt to hold small packets + +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0 + + +0 write(4, ..., 40) = 40 + +.05 write(4, ..., 40) = 40 + +// Set TCP_NODELAY sockopt should push pending bytes out + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +0 > P. 81:161(80) ack 1 + +.01 < . 1:1(0) ack 161 win 257 + +// Set MSG_MORE to hold small packets + +0 send(4, ..., 40, MSG_MORE) = 40 + +.05 send(4, ..., 40, MSG_MORE) = 40 + +// Set TCP_NODELAY sockopt should push pending bytes out + +.01 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +0 > . 161:241(80) ack 1 + +.01 < . 1:1(0) ack 241 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt new file mode 100644 index 000000000000..2087ec0c746a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that tx timestamping sends timestamps only for +// the last byte of each sendmsg. +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 20000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + + +0 write(3, ..., 11000) = 11000 + +0 > P. 1:10001(10000) ack 1 + +.01 < . 1:1(0) ack 10001 win 4000 + +0 > P. 10001:11001(1000) ack 1 + +.01 < . 1:1(0) ack 11001 win 4000 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after when SYN is acked. 
So, we expect the last byte of the first +// chunk to have a timestamp key of 10999 (i.e., 11000 - 1). + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the last byte should be received at t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt new file mode 100644 index 000000000000..876024a31110 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for partial writes (IPv4). +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 2000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1000], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have a partial write. + +0 write(3, ..., 10000) = 2964 + +0 > . 1:989(988) ack 1 + +0 > P. 989:1977(988) ack 1 + +.01 < . 1:1(0) ack 1977 win 92 + +0 > P. 1977:2965(988) ack 1 + +.01 < . 1:1(0) ack 2965 win 92 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after the first ack at t=20ms. 
+ +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after the first ack at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received after the last ack at +// t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt new file mode 100644 index 000000000000..84d94780e6be --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for server-side (IPv4). +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// Write two 2KB chunks. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after when SYN is acked. So, we expect the last byte of the first +// and the second chunks to have timestamp keys of 1999 (i.e., 2000 - 1) and +// 3999 (i.e., 4000 - 1) respectively. + +0 write(4, ..., 2000) = 2000 + +0 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 + +0 > P. 2001:4001(2000) ack 1 + +.01 < . 1:1(0) ack 2001 win 514 + +.01 < . 1:1(0) ack 4001 win 514 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after write at t=10ms. 
+ +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after write at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SCHED for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received at t=20ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the second chunk should be received at t=30ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 -- cgit v1.2.3 From 6f6692053939038f48c2f9f404fe414038a44431 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:52:00 +0000 Subject: selftests/net: packetdrill: import tcp/eor, tcp/splice, tcp/ts_recent, tcp/blocking Use the standard import and testing method, as described in the import of tcp/ecn and tcp/close , tcp/sack , tcp/tcp_info. 
Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-4-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../packetdrill/tcp_blocking_blocking-accept.pkt | 18 ++++++ .../packetdrill/tcp_blocking_blocking-connect.pkt | 13 ++++ .../net/packetdrill/tcp_blocking_blocking-read.pkt | 29 +++++++++ .../packetdrill/tcp_blocking_blocking-write.pkt | 35 +++++++++++ .../net/packetdrill/tcp_eor_no-coalesce-large.pkt | 38 ++++++++++++ .../packetdrill/tcp_eor_no-coalesce-retrans.pkt | 72 ++++++++++++++++++++++ .../net/packetdrill/tcp_eor_no-coalesce-small.pkt | 36 +++++++++++ .../packetdrill/tcp_eor_no-coalesce-subsequent.pkt | 66 ++++++++++++++++++++ .../tcp_splice_tcp_splice_loop_test.pkt | 20 ++++++ .../net/packetdrill/tcp_ts_recent_fin_tsval.pkt | 23 +++++++ .../net/packetdrill/tcp_ts_recent_invalid_ack.pkt | 25 ++++++++ .../net/packetdrill/tcp_ts_recent_reset_tsval.pkt | 25 ++++++++ 12 files changed, 400 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt new file mode 100644 index 000000000000..38535701656e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking accept. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +0...0.200 accept(3, ..., ...) = 4 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 257 + + +.1 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt new file mode 100644 index 000000000000..3692ef102381 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking connect. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + + +.1...0.200 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) + +.1 < S. 0:0(0) ack 1 win 5792 + +0 > . 
1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt new file mode 100644 index 000000000000..914eabab367a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking read. +--tolerance_usecs=10000 + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + + +0...0.100 read(4, ..., 2000) = 2000 + +.1 < P. 1:2001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 2001 + + +.1...0.200 read(4, ..., 2000) = 2000 + +.1 < P. 2001:4001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 + + +.1 < P. 4001:6001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 6001 + +0...0.000 read(4, ..., 1000) = 1000 + +0...0.000 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt new file mode 100644 index 000000000000..cec5a0725d95 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking write. +--tolerance_usecs=10000 + +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_min_tso_segs=10 +` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 50000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 50000 + +0 accept(3, ..., ...) = 4 + +// Kernel doubles our value -> sk->sk_sndbuf is set to 42000 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [21000], 4) = 0 + +0 getsockopt(4, SOL_SOCKET, SO_SNDBUF, [42000], [4]) = 0 + +// A write of 60000 does not block. + +0...0.300 write(4, ..., 61000) = 61000 // this write() blocks + + +.1 < . 1:1(0) ack 10001 win 50000 + + +.1 < . 1:1(0) ack 30001 win 50000 + +// This ACK should wakeup the write(). An ACK of 35001 does not. + +.1 < . 1:1(0) ack 36001 win 50000 + +// Reset to sysctls defaults. +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt new file mode 100644 index 000000000000..f95b9b3c9fa1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. The large chunk itself should be packetized as +// usual. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write another 10040B chunk with no coalescing options. + +0 send(4, ..., 10400, MSG_EOR) = 10400 + +// Write a 2KB chunk. 
This chunk should not be appended to the packets created +// the previous chunk. + +0 write(4, ..., 2000) = 2000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:20801(10800) ack 1 ++.001 < . 1:1(0) ack 20801 win 514 +// This 2KB packet should be sent alone. + +0 > P. 20801:22801(2000) ack 1 ++.001 < . 1:1(0) ack 22801 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt new file mode 100644 index 000000000000..2ff66075288e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. Also, when packets are retransmitted, they +// will not be coalesce into the same skb. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write 10 400B chunks with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 +// The 9 remaining 400B chunks should be sent as individual packets. + +0 > P. 10801:11201(400) ack 1 + +0 > P. 11201:11601(400) ack 1 + +0 > P. 11601:12001(400) ack 1 + +0 > P. 12001:12401(400) ack 1 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 + +0 > P. 13201:13601(400) ack 1 + +0 > P. 13601:14001(400) ack 1 + +0 > P. 14001:14401(400) ack 1 +// The last 10KB chunk should be sent separately. + +0 > P. 14401:24401(10000) ack 1 + ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 11201 win 514 ++.001 < . 1:1(0) ack 11601 win 514 ++.001 < . 1:1(0) ack 12001 win 514 +// TCP should fill the hole but no coalescing should happen, and all +// retransmissions should be sent out as individual packets. + +// Note : This is timeout based retransmit. +// Do not put +0 here or flakes will come back. ++.004~+.008 > P. 12001:12401(400) ack 1 + ++.001 < . 1:1(0) ack 12401 win 514 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 ++.001 < . 1:1(0) ack 12801 win 514 ++.001 < . 1:1(0) ack 14401 win 514 ++.001 < . 
1:1(0) ack 24401 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt new file mode 100644 index 000000000000..77039c5aac39 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write a 400B chunk with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 + +0 > P. 10801:20801(10000) ack 1 ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 20801 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt new file mode 100644 index 000000000000..dd5a06250595 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk even though we have 10 back-to-back small +// writes. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write 10 400B chunks with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 +// The 9 remaining 400B chunks should be sent as individual packets. + +0 > P. 10801:11201(400) ack 1 + +0 > P. 11201:11601(400) ack 1 + +0 > P. 11601:12001(400) ack 1 + +0 > P. 12001:12401(400) ack 1 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 + +0 > P. 13201:13601(400) ack 1 + +0 > P. 13601:14001(400) ack 1 + +0 > P. 14001:14401(400) ack 1 +// The last 10KB chunk should be sent separately. + +0 > P. 
14401:24401(10000) ack 1 + ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 11201 win 514 ++.001 < . 1:1(0) ack 11601 win 514 ++.001 < . 1:1(0) ack 12001 win 514 ++.001 < . 1:1(0) ack 12401 win 514 ++.001 < . 1:1(0) ack 12801 win 514 ++.001 < . 1:1(0) ack 13201 win 514 ++.001 < . 1:1(0) ack 13601 win 514 ++.001 < . 1:1(0) ack 14001 win 514 ++.001 < . 1:1(0) ack 14401 win 514 ++.001 < . 1:1(0) ack 24401 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt new file mode 100644 index 000000000000..0cbd43253236 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh` + +// Initialize a server socket + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_IP, IP_FREEBIND, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Connection should get accepted + +0 < S 0:0(0) win 32972 + +0 > S. 0:0(0) ack 1 <...> + +0 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + + +0 pipe([5, 6]) = 0 + +0 < U. 1:101(100) ack 1 win 257 urg 100 + +0 splice(4, NULL, 6, NULL, 99, 0) = 99 + +0 splice(4, NULL, 6, NULL, 1, 0) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt new file mode 100644 index 000000000000..e61424a7bd0a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send FIN packet with correct TSval +--tcp_ts_tick_usecs=1000 +--tolerance_usecs=7000 + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + + +1 close(4) = 0 +// Check that FIN TSval is updated properly, one second has passed since last sent packet. + +0 > F. 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt new file mode 100644 index 000000000000..174ce9a1bfc0 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we reject TS val updates on a packet with invalid ACK sequence + +`./defaults.sh +` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +.1 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + +// bad packet with high tsval (its ACK sequence is above our sndnxt) + +0 < F. 1:1(0) ack 9999 win 20000 + + + +0 < . 1:1001(1000) ack 1 win 20000 + +0 > . 
1:1(0) ack 1001 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt new file mode 100644 index 000000000000..2e3b3bb7493a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send RST packet with correct TSval +--tcp_ts_tick_usecs=1000 + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + + +0 < . 1:1001(1000) ack 1 win 20000 + +0 > . 1:1(0) ack 1001 + + +1 close(4) = 0 +// Check that RST TSval is updated properly, one second has passed since last sent packet. + +0 > R. 1:1(0) ack 1001 -- cgit v1.2.3 From 5d4cadef52f29eea779a0b44e09f59657c1b46d8 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:52:01 +0000 Subject: selftests/net: packetdrill: import tcp/user_timeout, tcp/validate, tcp/sendfile, tcp/limited-transmit, tcp/syscall_bad_arg Use the standard import and testing method, as described in the import of tcp/ecn and tcp/close , tcp/sack , tcp/tcp_info. Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-5-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- ...p_limited_transmit_limited-transmit-no-sack.pkt | 53 ++++++++++++++++++++++ .../tcp_limited_transmit_limited-transmit-sack.pkt | 50 ++++++++++++++++++++ .../packetdrill/tcp_sendfile_sendfile-simple.pkt | 26 +++++++++++ ...cp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt | 42 +++++++++++++++++ .../tcp_syscall_bad_arg_sendmsg-empty-iov.pkt | 30 ++++++++++++ ...tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt | 25 ++++++++++ .../tcp_user_timeout_user-timeout-probe.pkt | 37 +++++++++++++++ .../packetdrill/tcp_user_timeout_user_timeout.pkt | 32 +++++++++++++ .../tcp_validate_validate-established-no-flags.pkt | 24 ++++++++++ 9 files changed, 319 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt new file mode 100644 index 000000000000..96b01eb5b7a4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt @@ -0,0 +1,53 @@ +// 
SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that doesn't support SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + +// Limited transmit: on second dupack, send a new data segment. + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + +// It turned out to be reordering, not loss. +// We have one packet newly acked (1001:3001 were DUP-ACK'd) +// So we revert state back to Open. Slow start cwnd from 10 to 11 +// and send 11 - 9 = 2 packets + +.01 < . 1:1(0) ack 3001 win 320 + +0 > P. 12001:14001(2000) ack 1 + + +.02 < . 1:1(0) ack 5001 win 320 + +0 > P. 14001:15001(1000) ack 1 + +// Client gradually ACKs all data. + +.02 < . 1:1(0) ack 7001 win 320 + +.02 < . 1:1(0) ack 9001 win 320 + +.02 < . 1:1(0) ack 11001 win 320 + +.02 < . 1:1(0) ack 13001 win 320 + +.02 < . 1:1(0) ack 15001 win 320 + +// Clean up. + +.17 close(4) = 0 + +0 > F. 15001:15001(0) ack 1 + +.1 < F. 1:1(0) ack 15002 win 257 + +0 > . 15002:15002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt new file mode 100644 index 000000000000..642da51ec3a4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that supports SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + +// Limited transmit: on second dupack, send a new data segment. + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + +// It turned out to be reordering, not loss. + +.01 < . 1:1(0) ack 3001 win 320 + +0 > P. 12001:14001(2000) ack 1 + + +.02 < . 1:1(0) ack 5001 win 320 + +0 > P. 14001:15001(1000) ack 1 + +// Client gradually ACKs all data. + +.02 < . 1:1(0) ack 7001 win 320 + +.02 < . 1:1(0) ack 9001 win 320 + +.02 < . 1:1(0) ack 11001 win 320 + +.02 < . 1:1(0) ack 13001 win 320 + +.02 < . 1:1(0) ack 15001 win 320 + +// Clean up. + +.17 close(4) = 0 + +0 > F. 15001:15001(0) ack 1 + +.1 < F. 1:1(0) ack 15002 win 257 + +0 > . 
15002:15002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt new file mode 100644 index 000000000000..6740859a1360 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +// Simplest possible test of open() and then sendfile(). +// We write some zeroes into a file (since packetdrill expects payloads +// to be all zeroes) and then open() the file, then use sendfile() +// and verify that the correct number of zeroes goes out. + +`./defaults.sh +/bin/rm -f /tmp/testfile +/bin/dd bs=1 count=5 if=/dev/zero of=/tmp/testfile status=none +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + + +0 open("/tmp/testfile", O_RDONLY) = 5 + +0 sendfile(4, 5, [0], 5) = 5 + +0 > P. 1:6(5) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt new file mode 100644 index 000000000000..8940726a3ec2 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP fastopen behavior with NULL as buffer pointer, but a non-zero +// buffer length. +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0` + +// Cache warmup: send a Fast Open cookie request + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) ++0 > S 0:0(0) ++0 < S. 123:123(0) ack 1 win 14600 ++0 > . 1:1(0) ack 1 ++0 close(3) = 0 ++0 > F. 1:1(0) ack 1 ++0 < F. 1:1(0) ack 2 win 92 ++0 > . 2:2(0) ack 2 + +// Test with MSG_FASTOPEN without TCP_FASTOPEN_CONNECT. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 sendto(4, NULL, 1, MSG_FASTOPEN, ..., ...) = -1 ++0 close(4) = 0 + +// Test with TCP_FASTOPEN_CONNECT without MSG_FASTOPEN. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5 ++0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(5, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(5, ..., ...) = 0 ++0 sendto(5, NULL, 1, 0, ..., ...) = -1 ++0 close(5) = 0 + +// Test with both TCP_FASTOPEN_CONNECT and MSG_FASTOPEN. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 6 ++0 fcntl(6, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(6, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(6, ..., ...) = 0 ++0 sendto(6, NULL, 1, MSG_FASTOPEN, ..., ...) = -1 ++0 close(6) = 0 + +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt new file mode 100644 index 000000000000..b2b2cdf27e20 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we correctly skip zero-length IOVs. 
+`./defaults.sh` + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 40}, {..., 0}, {..., 20}], + msg_flags=0}, 0) = 60 + +0 > P. 1:61(60) ack 1 + +.01 < . 1:1(0) ack 61 win 257 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 0}, {..., 0}, {..., 0}], + msg_flags=0}, MSG_ZEROCOPY) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 10}, {..., 0}, {..., 50}], + msg_flags=0}, MSG_ZEROCOPY) = 60 + +0 > P. 61:121(60) ack 1 + +.01 < . 1:1(0) ack 121 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt new file mode 100644 index 000000000000..59f5903f285c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test kernel behavior with NULL as buffer pointer + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.2 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + + +0 write(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 send(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 sendto(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) + + +0 < . 1:1001(1000) ack 1 win 200 + +0 read(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 recv(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 recvfrom(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt new file mode 100644 index 000000000000..183051ba0cae --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + + +0 < S 0:0(0) win 0 + +0 > S. 0:0(0) ack 1 + + +.1 < . 1:1(0) ack 1 win 65530 + +0 accept(3, ..., ...) = 4 + + +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 write(4, ..., 24) = 24 + +0 > P. 1:25(24) ack 1 + +.1 < . 1:1(0) ack 25 win 65530 + +0 %{ assert tcpi_probes == 0, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +// install a qdisc dropping all packets + +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` + +0 write(4, ..., 24) = 24 + // When qdisc is congested we retry every 500ms + // (TCP_RESOURCE_PROBE_INTERVAL) and therefore + // we retry 6 times before hitting 3s timeout. 
+ // First verify that the connection is alive: ++3.250 write(4, ..., 24) = 24 + // Now verify that shortly after that the socket is dead: + +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) + + +0 %{ assert tcpi_probes == 6, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +0 close(4) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt new file mode 100644 index 000000000000..2efe02bfba9c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 32792 + + + +0 accept(3, ..., ...) = 4 + +// Okay, we received nothing, and decide to close this idle socket. +// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth +// trying hard to cleanly close this flow, at the price of keeping +// a TCP structure in kernel for about 1 minute ! + +2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 close(4) = 0 + + +0 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.6~+.800 > F. 1:1(0) ack 1 + +// We finally receive something from the peer, but it is way too late +// Our socket vanished because TCP_USER_TIMEOUT was really small + +0 < . 1:2(1) ack 1 win 32792 + +0 > R 1:1(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt new file mode 100644 index 000000000000..8bd60226ccfc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that established connections drop a segment without the ACK flag set. + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + +// Receive a segment with no flags set, verify that it's not enqueued. + +.01 < - 1:1001(1000) win 20000 + +0 ioctl(4, SIOCINQ, [0]) = 0 + +// Receive a segment with ACK flag set, verify that it is enqueued. + +.01 < . 1:1001(1000) ack 1 win 20000 + +0 ioctl(4, SIOCINQ, [1000]) = 0 -- cgit v1.2.3 From a4e17a8f239a545c463f8ec27db4ed6e74b31841 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Thu, 5 Dec 2024 17:50:35 -0300 Subject: ktest.pl: Check kernelrelease return in get_version In the case of a test that uses the special option ${KERNEL_VERSION} in one of its settings but has no configuration available in ${OUTPUT_DIR}, for example if it's a new empty directory, then the `make kernelrelease` call will fail and the subroutine will chomp an empty string, silently. Fix that by adding an empty configuration and retrying. Cc: stable@vger.kernel.org Cc: John Hawley Fixes: 5f9b6ced04a4e ("ktest: Bisecting, install modules, add logging") Link: https://lore.kernel.org/20241205-ktest_kver_fallback-v2-1-869dae4c7777@suse.com Signed-off-by: Ricardo B. 
Marliere Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index dacad94e2be4..171262915636 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2419,6 +2419,11 @@ sub get_version { return if ($have_version); doprint "$make kernelrelease ... "; $version = `$make -s kernelrelease | tail -1`; + if (!length($version)) { + run_command "$make allnoconfig" or return 0; + doprint "$make kernelrelease ... "; + $version = `$make -s kernelrelease | tail -1`; + } chomp($version); doprint "$version\n"; $have_version = 1; -- cgit v1.2.3 From 776735b954f49f85fd19e1198efa421fae2ad77c Mon Sep 17 00:00:00 2001 From: Ba Jing Date: Mon, 2 Sep 2024 21:07:35 +0800 Subject: ktest.pl: Remove unused declarations in run_bisect_test function Since $output and $ret are not used in the subsequent code, the declarations should be removed. Fixes: a75fececff3c ("ktest: Added sample.conf, new %default option format") Link: https://lore.kernel.org/20240902130735.6034-1-bajing@cmss.chinamobile.com Signed-off-by: Ba Jing Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 171262915636..c76ad0be54e2 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2965,8 +2965,6 @@ sub run_bisect_test { my $failed = 0; my $result; - my $output; - my $ret; $in_bisect = 1; -- cgit v1.2.3 From 770221a36932a65c5a8b7711b5477430a1dbf5e8 Mon Sep 17 00:00:00 2001 From: Ba Jing Date: Mon, 2 Sep 2024 20:46:45 +0800 Subject: ktest.pl: Fix typo in comment "on of these" should be "one of these". Fixes: 77d942ceacbad ("ktest: Create variables for the ktest config files") Link: https://lore.kernel.org/20240902124645.5674-1-bajing@cmss.chinamobile.com Signed-off-by: Ba Jing Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index c76ad0be54e2..8c8da966c641 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -1245,7 +1245,7 @@ sub __read_config { # Config variables are only active while reading the # config and can be defined anywhere. They also ignore # TEST_START and DEFAULTS, but are skipped if they are in - # on of these sections that have SKIP defined. + # one of these sections that have SKIP defined. # The save variable can be # defined multiple times and the new one simply overrides # the previous one. -- cgit v1.2.3 From f3a30016e4b557495d49df7851f18ad97b6d5a23 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Wed, 18 Dec 2024 22:04:37 +0800 Subject: ktest.pl: Fix typo "accesing" There is a spelling mistake of 'accesing' in comments which should be 'accessing'. 
Fixes: 6d76f469c8ac9 ("ktest: Add useful example configs") Link: https://lore.kernel.org/8714AE3735C0EA0B+20241218140437.194906-1-wangyuli@uniontech.com Reviewed-by: SeongJae Park Signed-off-by: WangYuli Signed-off-by: Steven Rostedt --- tools/testing/ktest/examples/include/defaults.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/ktest/examples/include/defaults.conf b/tools/testing/ktest/examples/include/defaults.conf index 63a1a83f4f0b..f6d8517a471e 100644 --- a/tools/testing/ktest/examples/include/defaults.conf +++ b/tools/testing/ktest/examples/include/defaults.conf @@ -46,7 +46,7 @@ CLEAR_LOG = 1 SSH_USER = root -# For accesing the machine, we will ssh to root@machine. +# For accessing the machine, we will ssh to root@machine. SSH := ssh ${SSH_USER}@${MACHINE} # Update this. The default here is ktest will ssh to the target box -- cgit v1.2.3 From 09bb926d290789ff35e7fa53045811a8c57356a9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:33 -0800 Subject: KVM: selftests: Return a value from vcpu_get_reg() instead of using an out-param Return a uint64_t from vcpu_get_reg() instead of having the caller provide a pointer to storage, as none of the vcpu_get_reg() usage in KVM selftests accesses a register larger than 64 bits, and vcpu_set_reg() only accepts a 64-bit value. If a use case comes along that needs to get a register that is larger than 64 bits, then a utility can be added to assert success and take a void pointer, but until then, forcing an out param yields ugly code and prevents feeding the output of vcpu_get_reg() into vcpu_set_reg(). Reviewed-by: Andrew Jones Acked-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20241128005547.4077116-3-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/aarch64/aarch32_id_regs.c | 10 ++-- .../selftests/kvm/aarch64/debug-exceptions.c | 4 +- tools/testing/selftests/kvm/aarch64/hypercalls.c | 6 +- tools/testing/selftests/kvm/aarch64/no-vgic-v3.c | 2 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 8 +-- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 22 ++++---- .../selftests/kvm/aarch64/vpmu_counter_access.c | 19 +++---- tools/testing/selftests/kvm/include/kvm_util.h | 6 +- .../testing/selftests/kvm/lib/aarch64/processor.c | 8 +-- tools/testing/selftests/kvm/lib/riscv/processor.c | 66 +++++++++++----------- tools/testing/selftests/kvm/riscv/arch_timer.c | 2 +- tools/testing/selftests/kvm/riscv/ebreak_test.c | 2 +- tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 2 +- tools/testing/selftests/kvm/s390x/resets.c | 2 +- tools/testing/selftests/kvm/steal_time.c | 3 +- 15 files changed, 81 insertions(+), 81 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c index 8e5bd07a3727..447d61cae4db 100644 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -97,7 +97,7 @@ static void test_user_raz_wi(struct kvm_vcpu *vcpu) uint64_t reg_id = raz_wi_reg_ids[i]; uint64_t val; - vcpu_get_reg(vcpu, reg_id, &val); + val = vcpu_get_reg(vcpu, reg_id); TEST_ASSERT_EQ(val, 0); /* @@ -106,7 +106,7 @@ static void test_user_raz_wi(struct kvm_vcpu *vcpu) */ vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); - vcpu_get_reg(vcpu, reg_id, &val); + val = vcpu_get_reg(vcpu, reg_id); TEST_ASSERT_EQ(val, 0); } } @@ -126,14 +126,14 @@ static void test_user_raz_invariant(struct kvm_vcpu *vcpu) 
uint64_t reg_id = raz_invariant_reg_ids[i]; uint64_t val; - vcpu_get_reg(vcpu, reg_id, &val); + val = vcpu_get_reg(vcpu, reg_id); TEST_ASSERT_EQ(val, 0); r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); TEST_ASSERT(r < 0 && errno == EINVAL, "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); - vcpu_get_reg(vcpu, reg_id, &val); + val = vcpu_get_reg(vcpu, reg_id); TEST_ASSERT_EQ(val, 0); } } @@ -144,7 +144,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) { uint64_t val, el0; - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); return el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY; diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index ff7a949fc96a..c7fb55c9135b 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -501,7 +501,7 @@ void test_single_step_from_userspace(int test_cnt) TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); /* Check if the current pc is expected. */ - vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); TEST_ASSERT(!test_pc || pc == test_pc, "Unexpected pc 0x%lx (expected 0x%lx)", pc, test_pc); @@ -583,7 +583,7 @@ int main(int argc, char *argv[]) uint64_t aa64dfr0; vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &aa64dfr0); + aa64dfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); __TEST_REQUIRE(debug_version(aa64dfr0) >= 6, "Armv8 debug architecture not supported."); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 9d192ce0078d..ec54ec7726e9 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -173,7 +173,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; /* First 'read' should be an upper limit of the features supported */ - vcpu_get_reg(vcpu, reg_info->reg, &val); + val = vcpu_get_reg(vcpu, reg_info->reg); TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); @@ -184,7 +184,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) "Failed to clear all the features of reg: 0x%lx; ret: %d", reg_info->reg, errno); - vcpu_get_reg(vcpu, reg_info->reg, &val); + val = vcpu_get_reg(vcpu, reg_info->reg); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); @@ -214,7 +214,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) * Before starting the VM, the test clears all the bits. * Check if that's still the case. 
*/ - vcpu_get_reg(vcpu, reg_info->reg, &val); + val = vcpu_get_reg(vcpu, reg_info->reg); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c index 58304bbc2036..ebd70430c89d 100644 --- a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c +++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c @@ -164,7 +164,7 @@ int main(int argc, char *argv[]) uint64_t pfr0; vm = vm_create_with_one_vcpu(&vcpu, NULL); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &pfr0); + pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), "GICv3 not supported."); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index eaa7655fefc1..ab491ee9e5f7 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -111,8 +111,8 @@ static void assert_vcpu_reset(struct kvm_vcpu *vcpu) { uint64_t obs_pc, obs_x0; - vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &obs_pc); - vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + obs_pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); + obs_x0 = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0])); TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, "unexpected target cpu pc: %lx (expected: %lx)", @@ -152,7 +152,7 @@ static void host_test_cpu_on(void) */ vcpu_power_off(target); - vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); + target_mpidr = vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); vcpu_args_set(source, 1, target_mpidr & MPIDR_HWID_BITMASK); enter_guest(source); @@ -244,7 +244,7 @@ static void host_test_system_off2(void) setup_vm(guest_test_system_off2, &source, &target); - vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION, &psci_version); + psci_version = vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION); TEST_ASSERT(psci_version >= PSCI_VERSION(1, 3), "Unexpected PSCI version %lu.%lu", diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index a79b7f18452d..bc6cf50e5135 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -346,7 +346,7 @@ static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, uint64_t mask = ftr_bits->mask; uint64_t val, new_val, ftr; - vcpu_get_reg(vcpu, reg, &val); + val = vcpu_get_reg(vcpu, reg); ftr = (val & mask) >> shift; ftr = get_safe_value(ftr_bits, ftr); @@ -356,7 +356,7 @@ static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, val |= ftr; vcpu_set_reg(vcpu, reg, val); - vcpu_get_reg(vcpu, reg, &new_val); + new_val = vcpu_get_reg(vcpu, reg); TEST_ASSERT_EQ(new_val, val); return new_val; @@ -370,7 +370,7 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, uint64_t val, old_val, ftr; int r; - vcpu_get_reg(vcpu, reg, &val); + val = vcpu_get_reg(vcpu, reg); ftr = (val & mask) >> shift; ftr = get_invalid_value(ftr_bits, ftr); @@ -384,7 +384,7 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, TEST_ASSERT(r < 0 && errno == EINVAL, "Unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); - vcpu_get_reg(vcpu, reg, &val); + val = vcpu_get_reg(vcpu, reg); TEST_ASSERT_EQ(val, old_val); } @@ -471,7 +471,7 @@ static void test_user_set_mpam_reg(struct kvm_vcpu 
*vcpu) } /* Get the id register value */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); /* Try to set MPAM=0. This should always be possible. */ val &= ~ID_AA64PFR0_EL1_MPAM_MASK; @@ -508,7 +508,7 @@ static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu) } /* Get the id register value */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), &val); + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); /* Try to set MPAM_frac=0. This should always be possible. */ val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; @@ -576,7 +576,7 @@ static void test_clidr(struct kvm_vcpu *vcpu) uint64_t clidr; int level; - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); /* find the first empty level in the cache hierarchy */ for (level = 1; level < 7; level++) { @@ -601,7 +601,7 @@ static void test_ctr(struct kvm_vcpu *vcpu) { u64 ctr; - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), &ctr); + ctr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0)); ctr &= ~CTR_EL0_DIC_MASK; if (ctr & CTR_EL0_IminLine_MASK) ctr--; @@ -617,7 +617,7 @@ static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) test_clidr(vcpu); test_ctr(vcpu); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); val++; vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); @@ -630,7 +630,7 @@ static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encodin size_t idx = encoding_to_range_idx(encoding); uint64_t observed; - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + observed = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding)); TEST_ASSERT_EQ(test_reg_vals[idx], observed); } @@ -665,7 +665,7 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Check for AARCH64 only system */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); aarch64_only = (el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY); diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index f9c0c86d7e85..f16b3b27e32e 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -440,8 +440,7 @@ static void create_vpmu_vm(void *guest_code) "Failed to create vgic-v3, skipping"); /* Make sure that PMUv3 support is indicated in the ID register */ - vcpu_get_reg(vpmu_vm.vcpu, - KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &dfr0); + dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, @@ -484,7 +483,7 @@ static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) create_vpmu_vm(guest_code); vcpu = vpmu_vm.vcpu; - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr_orig); + pmcr_orig = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); pmcr = pmcr_orig; /* @@ -493,7 +492,7 @@ static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) */ set_pmcr_n(&pmcr, pmcr_n); vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), pmcr); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr); + 
pmcr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); if (expect_fail) TEST_ASSERT(pmcr_orig == pmcr, @@ -521,7 +520,7 @@ static void run_access_test(uint64_t pmcr_n) vcpu = vpmu_vm.vcpu; /* Save the initial sp to restore them later to run the guest again */ - vcpu_get_reg(vcpu, ARM64_CORE_REG(sp_el1), &sp); + sp = vcpu_get_reg(vcpu, ARM64_CORE_REG(sp_el1)); run_vcpu(vcpu, pmcr_n); @@ -572,12 +571,12 @@ static void run_pmregs_validity_test(uint64_t pmcr_n) * Test if the 'set' and 'clr' variants of the registers * are initialized based on the number of valid counters. */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(set_reg_id), reg_val); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(clr_reg_id), reg_val); @@ -589,12 +588,12 @@ static void run_pmregs_validity_test(uint64_t pmcr_n) */ vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), max_counters_mask); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(set_reg_id), reg_val); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(clr_reg_id), reg_val); @@ -625,7 +624,7 @@ static uint64_t get_pmcr_n_limit(void) uint64_t pmcr; create_vpmu_vm(guest_code); - vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr); + pmcr = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); destroy_vpmu_vm(); return get_pmcr_n(pmcr); } diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index bc7c242480d6..287a3ec06df4 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -702,11 +702,13 @@ static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t va return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); } -static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +static inline uint64_t vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id) { - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + uint64_t val; + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); + return val; } static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) { diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 698e34f39241..7ba3aa3755f3 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -281,8 +281,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) */ vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); - vcpu_get_reg(vcpu, 
KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); + sctlr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1)); + tcr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1)); /* Configure base granule size */ switch (vm->mode) { @@ -360,8 +360,8 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { uint64_t pstate, pc; - vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate), &pstate); - vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + pstate = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate)); + pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", indent, "", pstate, pc); diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 6ae47b3d6b25..dd663bcf0cc0 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -221,39 +221,39 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { struct kvm_riscv_core core; - vcpu_get_reg(vcpu, RISCV_CORE_REG(mode), &core.mode); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &core.regs.pc); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.ra), &core.regs.ra); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.sp), &core.regs.sp); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.gp), &core.regs.gp); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.tp), &core.regs.tp); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t0), &core.regs.t0); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t1), &core.regs.t1); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t2), &core.regs.t2); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s0), &core.regs.s0); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s1), &core.regs.s1); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a0), &core.regs.a0); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a1), &core.regs.a1); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a2), &core.regs.a2); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a3), &core.regs.a3); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a4), &core.regs.a4); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a5), &core.regs.a5); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a6), &core.regs.a6); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a7), &core.regs.a7); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s2), &core.regs.s2); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s3), &core.regs.s3); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s4), &core.regs.s4); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s5), &core.regs.s5); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s6), &core.regs.s6); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s7), &core.regs.s7); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s8), &core.regs.s8); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s9), &core.regs.s9); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s10), &core.regs.s10); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s11), &core.regs.s11); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t3), &core.regs.t3); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t4), &core.regs.t4); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t5), &core.regs.t5); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t6), &core.regs.t6); + core.mode = vcpu_get_reg(vcpu, RISCV_CORE_REG(mode)); + core.regs.pc = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc)); + core.regs.ra = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.ra)); + core.regs.sp = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.sp)); + core.regs.gp = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.gp)); + core.regs.tp = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.tp)); + core.regs.t0 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t0)); + core.regs.t1 = vcpu_get_reg(vcpu, 
RISCV_CORE_REG(regs.t1)); + core.regs.t2 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t2)); + core.regs.s0 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s0)); + core.regs.s1 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s1)); + core.regs.a0 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a0)); + core.regs.a1 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a1)); + core.regs.a2 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a2)); + core.regs.a3 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a3)); + core.regs.a4 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a4)); + core.regs.a5 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a5)); + core.regs.a6 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a6)); + core.regs.a7 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a7)); + core.regs.s2 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s2)); + core.regs.s3 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s3)); + core.regs.s4 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s4)); + core.regs.s5 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s5)); + core.regs.s6 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s6)); + core.regs.s7 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s7)); + core.regs.s8 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s8)); + core.regs.s9 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s9)); + core.regs.s10 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s10)); + core.regs.s11 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s11)); + core.regs.t3 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t3)); + core.regs.t4 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t4)); + core.regs.t5 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t5)); + core.regs.t6 = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t6)); fprintf(stream, " MODE: 0x%lx\n", core.mode); diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 2c792228ac0b..9e370800a6a2 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -93,7 +93,7 @@ struct kvm_vm *test_vm_create(void) vcpu_init_vector_tables(vcpus[i]); /* Initialize guest timer frequency. */ - vcpu_get_reg(vcpus[0], RISCV_TIMER_REG(frequency), &timer_freq); + timer_freq = vcpu_get_reg(vcpus[0], RISCV_TIMER_REG(frequency)); sync_global_to_guest(vm, timer_freq); pr_debug("timer_freq: %lu\n", timer_freq); diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c index 0e0712854953..cfed6c727bfc 100644 --- a/tools/testing/selftests/kvm/riscv/ebreak_test.c +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -60,7 +60,7 @@ int main(void) TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); - vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + pc = vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc)); TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); /* skip sw_bp_1 */ diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index f299cbfd23ca..f45c0ecc902d 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -608,7 +608,7 @@ static void test_vm_events_overflow(void *guest_code) vcpu_init_vector_tables(vcpu); /* Initialize guest timer frequency. 
*/ - vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + timer_freq = vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency)); sync_global_to_guest(vm, timer_freq); run_vcpu(vcpu); diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 357943f2bea8..b58f75b381e5 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -61,7 +61,7 @@ static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) { uint64_t eval_reg; - vcpu_get_reg(vcpu, id, &eval_reg); + eval_reg = vcpu_get_reg(vcpu, id); TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index a8d3afa0b86b..cce2520af720 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -269,9 +269,8 @@ static void guest_code(int cpu) static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { uint64_t id = RISCV_SBI_EXT_REG(KVM_RISCV_SBI_EXT_STA); - unsigned long enabled; + unsigned long enabled = vcpu_get_reg(vcpu, id); - vcpu_get_reg(vcpu, id, &enabled); TEST_ASSERT(enabled == 0 || enabled == 1, "Expected boolean result"); return enabled; -- cgit v1.2.3 From fe85ce31b2891611a2e4d788872be815cea85a4b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:34 -0800 Subject: KVM: selftests: Assert that vcpu_{g,s}et_reg() won't truncate Assert that the register being read/written by vcpu_{g,s}et_reg() is no larger than a uint64_t, i.e. that a selftest isn't unintentionally truncating the value being read/written. Ideally, the assert would be done at compile-time, but that would limit the checks to hardcoded accesses and/or require fancier compile-time assertion infrastructure to filter out dynamic usage. Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 287a3ec06df4..4c4e5a847f67 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -707,6 +707,8 @@ static inline uint64_t vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id) uint64_t val; struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + TEST_ASSERT(KVM_REG_SIZE(id) <= sizeof(val), "Reg %lx too big", id); + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); return val; } @@ -714,6 +716,8 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val { struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + TEST_ASSERT(KVM_REG_SIZE(id) <= sizeof(val), "Reg %lx too big", id); + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); } -- cgit v1.2.3 From d6533c15133867cd3032bcab7f839ae5d53d0e70 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:35 -0800 Subject: KVM: selftests: Check for a potential unhandled exception iff KVM_RUN succeeded Don't check for an unhandled exception if KVM_RUN failed, e.g. if it returned errno=EFAULT, as reporting unhandled exceptions is done via a ucall, i.e. requires KVM_RUN to exit cleanly. Theoretically, checking for a ucall on a failed KVM_RUN could get a false positive, e.g. if there were stale data in vcpu->run from a previous exit. 
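For illustration only (this snippet is not part of the patch), a caller that expects KVM_RUN itself to fail, e.g. with EFAULT, would use the bare __vcpu_run() and trust the ucall-based reporting only after a clean exit:

	int r = __vcpu_run(vcpu);

	if (r) {
		/* KVM_RUN failed; there is no fresh exit data or ucall to inspect. */
		TEST_ASSERT(errno == EFAULT, "Unexpected KVM_RUN error: %d", errno);
	} else {
		/* Clean exit: any pending ucall is valid, so the assert is safe. */
		assert_on_unhandled_exception(vcpu);
	}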
Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 480e3a40d197..33fefeb3ca44 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1648,7 +1648,8 @@ int _vcpu_run(struct kvm_vcpu *vcpu) rc = __vcpu_run(vcpu); } while (rc == -1 && errno == EINTR); - assert_on_unhandled_exception(vcpu); + if (!rc) + assert_on_unhandled_exception(vcpu); return rc; } -- cgit v1.2.3 From b12391498d1e7ee49390cda34df4a5cc21700e9f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:36 -0800 Subject: KVM: selftests: Rename max_guest_memory_test to mmu_stress_test Rename max_guest_memory_test to mmu_stress_test so that the name isn't horribly misleading when future changes extend the test to verify things like mprotect() interactions, and because the test is useful even when its configured to populate far less than the maximum amount of guest memory. Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 2 +- .../testing/selftests/kvm/max_guest_memory_test.c | 289 --------------------- tools/testing/selftests/kvm/mmu_stress_test.c | 289 +++++++++++++++++++++ 3 files changed, 290 insertions(+), 290 deletions(-) delete mode 100644 tools/testing/selftests/kvm/max_guest_memory_test.c create mode 100644 tools/testing/selftests/kvm/mmu_stress_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 41593d2e7de9..4384e5f45c36 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -140,7 +140,7 @@ TEST_GEN_PROGS_x86_64 += guest_print_test TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test -TEST_GEN_PROGS_x86_64 += max_guest_memory_test +TEST_GEN_PROGS_x86_64 += mmu_stress_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += memslot_perf_test TEST_GEN_PROGS_x86_64 += rseq_test diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c deleted file mode 100644 index 0b9678858b6d..000000000000 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kvm_util.h" -#include "test_util.h" -#include "guest_modes.h" -#include "processor.h" - -static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) -{ - uint64_t gpa; - - for (;;) { - for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; - GUEST_SYNC(0); - } -} - -struct vcpu_info { - struct kvm_vcpu *vcpu; - uint64_t start_gpa; - uint64_t end_gpa; -}; - -static int nr_vcpus; -static atomic_t rendezvous; - -static void rendezvous_with_boss(void) -{ - int orig = atomic_read(&rendezvous); - - if (orig > 0) { - atomic_dec_and_test(&rendezvous); - while (atomic_read(&rendezvous) > 0) - 
cpu_relax(); - } else { - atomic_inc(&rendezvous); - while (atomic_read(&rendezvous) < 0) - cpu_relax(); - } -} - -static void run_vcpu(struct kvm_vcpu *vcpu) -{ - vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); -} - -static void *vcpu_worker(void *data) -{ - struct vcpu_info *info = data; - struct kvm_vcpu *vcpu = info->vcpu; - struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; - - vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); - - rendezvous_with_boss(); - - run_vcpu(vcpu); - rendezvous_with_boss(); - vcpu_sregs_get(vcpu, &sregs); -#ifdef __x86_64__ - /* Toggle CR0.WP to trigger a MMU context reset. */ - sregs.cr0 ^= X86_CR0_WP; -#endif - vcpu_sregs_set(vcpu, &sregs); - rendezvous_with_boss(); - - run_vcpu(vcpu); - rendezvous_with_boss(); - - return NULL; -} - -static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus, - uint64_t start_gpa, uint64_t end_gpa) -{ - struct vcpu_info *info; - uint64_t gpa, nr_bytes; - pthread_t *threads; - int i; - - threads = malloc(nr_vcpus * sizeof(*threads)); - TEST_ASSERT(threads, "Failed to allocate vCPU threads"); - - info = malloc(nr_vcpus * sizeof(*info)); - TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges"); - - nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) & - ~((uint64_t)vm->page_size - 1); - TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus); - - for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) { - info[i].vcpu = vcpus[i]; - info[i].start_gpa = gpa; - info[i].end_gpa = gpa + nr_bytes; - pthread_create(&threads[i], NULL, vcpu_worker, &info[i]); - } - return threads; -} - -static void rendezvous_with_vcpus(struct timespec *time, const char *name) -{ - int i, rendezvoused; - - pr_info("Waiting for vCPUs to finish %s...\n", name); - - rendezvoused = atomic_read(&rendezvous); - for (i = 0; abs(rendezvoused) != 1; i++) { - usleep(100); - if (!(i & 0x3f)) - pr_info("\r%d vCPUs haven't rendezvoused...", - abs(rendezvoused) - 1); - rendezvoused = atomic_read(&rendezvous); - } - - clock_gettime(CLOCK_MONOTONIC, time); - - /* Release the vCPUs after getting the time of the previous action. */ - pr_info("\rAll vCPUs finished %s, releasing...\n", name); - if (rendezvoused > 0) - atomic_set(&rendezvous, -nr_vcpus - 1); - else - atomic_set(&rendezvous, nr_vcpus + 1); -} - -static void calc_default_nr_vcpus(void) -{ - cpu_set_t possible_mask; - int r; - - r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); - TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", - errno, strerror(errno)); - - nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; - TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); -} - -int main(int argc, char *argv[]) -{ - /* - * Skip the first 4gb and slot0. slot0 maps <1gb and is used to back - * the guest's code, stack, and page tables. Because selftests creates - * an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot - * just below the 4gb boundary. This test could create memory at - * 1gb-3gb,but it's simpler to skip straight to 4gb. 
- */ - const uint64_t start_gpa = SZ_4G; - const int first_slot = 1; - - struct timespec time_start, time_run1, time_reset, time_run2; - uint64_t max_gpa, gpa, slot_size, max_mem, i; - int max_slots, slot, opt, fd; - bool hugepages = false; - struct kvm_vcpu **vcpus; - pthread_t *threads; - struct kvm_vm *vm; - void *mem; - - /* - * Default to 2gb so that maxing out systems with MAXPHADDR=46, which - * are quite common for x86, requires changing only max_mem (KVM allows - * 32k memslots, 32k * 2gb == ~64tb of guest memory). - */ - slot_size = SZ_2G; - - max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); - TEST_ASSERT(max_slots > first_slot, "KVM is broken"); - - /* All KVM MMUs should be able to survive a 128gb guest. */ - max_mem = 128ull * SZ_1G; - - calc_default_nr_vcpus(); - - while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) { - switch (opt) { - case 'c': - nr_vcpus = atoi_positive("Number of vCPUs", optarg); - break; - case 'm': - max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G; - break; - case 's': - slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G; - break; - case 'H': - hugepages = true; - break; - case 'h': - default: - printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]); - exit(1); - } - } - - vcpus = malloc(nr_vcpus * sizeof(*vcpus)); - TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); - - vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - - max_gpa = vm->max_gfn << vm->page_shift; - TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); - - fd = kvm_memfd_alloc(slot_size, hugepages); - mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); - - TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed"); - - /* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */ - for (i = 0; i < slot_size; i += vm->page_size) - ((uint8_t *)mem)[i] = 0xaa; - - gpa = 0; - for (slot = first_slot; slot < max_slots; slot++) { - gpa = start_gpa + ((slot - first_slot) * slot_size); - if (gpa + slot_size > max_gpa) - break; - - if ((gpa - start_gpa) >= max_mem) - break; - - vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem); - -#ifdef __x86_64__ - /* Identity map memory in the guest using 1gb pages. */ - for (i = 0; i < slot_size; i += SZ_1G) - __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); -#else - for (i = 0; i < slot_size; i += vm->page_size) - virt_pg_map(vm, gpa + i, gpa + i); -#endif - } - - atomic_set(&rendezvous, nr_vcpus + 1); - threads = spawn_workers(vm, vcpus, start_gpa, gpa); - - free(vcpus); - vcpus = NULL; - - pr_info("Running with %lugb of guest memory and %u vCPUs\n", - (gpa - start_gpa) / SZ_1G, nr_vcpus); - - rendezvous_with_vcpus(&time_start, "spawning"); - rendezvous_with_vcpus(&time_run1, "run 1"); - rendezvous_with_vcpus(&time_reset, "reset"); - rendezvous_with_vcpus(&time_run2, "run 2"); - - time_run2 = timespec_sub(time_run2, time_reset); - time_reset = timespec_sub(time_reset, time_run1); - time_run1 = timespec_sub(time_run1, time_start); - - pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds\n", - time_run1.tv_sec, time_run1.tv_nsec, - time_reset.tv_sec, time_reset.tv_nsec, - time_run2.tv_sec, time_run2.tv_nsec); - - /* - * Delete even numbered slots (arbitrary) and unmap the first half of - * the backing (also arbitrary) to verify KVM correctly drops all - * references to the removed regions. 
- */ - for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2) - vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL); - - munmap(mem, slot_size / 2); - - /* Sanity check that the vCPUs actually ran. */ - for (i = 0; i < nr_vcpus; i++) - pthread_join(threads[i], NULL); - - /* - * Deliberately exit without deleting the remaining memslots or closing - * kvm_fd to test cleanup via mmu_notifier.release. - */ -} diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c new file mode 100644 index 000000000000..0b9678858b6d --- /dev/null +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "test_util.h" +#include "guest_modes.h" +#include "processor.h" + +static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) +{ + uint64_t gpa; + + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } +} + +struct vcpu_info { + struct kvm_vcpu *vcpu; + uint64_t start_gpa; + uint64_t end_gpa; +}; + +static int nr_vcpus; +static atomic_t rendezvous; + +static void rendezvous_with_boss(void) +{ + int orig = atomic_read(&rendezvous); + + if (orig > 0) { + atomic_dec_and_test(&rendezvous); + while (atomic_read(&rendezvous) > 0) + cpu_relax(); + } else { + atomic_inc(&rendezvous); + while (atomic_read(&rendezvous) < 0) + cpu_relax(); + } +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); +} + +static void *vcpu_worker(void *data) +{ + struct vcpu_info *info = data; + struct kvm_vcpu *vcpu = info->vcpu; + struct kvm_vm *vm = vcpu->vm; + struct kvm_sregs sregs; + + vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); + + rendezvous_with_boss(); + + run_vcpu(vcpu); + rendezvous_with_boss(); + vcpu_sregs_get(vcpu, &sregs); +#ifdef __x86_64__ + /* Toggle CR0.WP to trigger a MMU context reset. 
*/ + sregs.cr0 ^= X86_CR0_WP; +#endif + vcpu_sregs_set(vcpu, &sregs); + rendezvous_with_boss(); + + run_vcpu(vcpu); + rendezvous_with_boss(); + + return NULL; +} + +static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus, + uint64_t start_gpa, uint64_t end_gpa) +{ + struct vcpu_info *info; + uint64_t gpa, nr_bytes; + pthread_t *threads; + int i; + + threads = malloc(nr_vcpus * sizeof(*threads)); + TEST_ASSERT(threads, "Failed to allocate vCPU threads"); + + info = malloc(nr_vcpus * sizeof(*info)); + TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges"); + + nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) & + ~((uint64_t)vm->page_size - 1); + TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus); + + for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) { + info[i].vcpu = vcpus[i]; + info[i].start_gpa = gpa; + info[i].end_gpa = gpa + nr_bytes; + pthread_create(&threads[i], NULL, vcpu_worker, &info[i]); + } + return threads; +} + +static void rendezvous_with_vcpus(struct timespec *time, const char *name) +{ + int i, rendezvoused; + + pr_info("Waiting for vCPUs to finish %s...\n", name); + + rendezvoused = atomic_read(&rendezvous); + for (i = 0; abs(rendezvoused) != 1; i++) { + usleep(100); + if (!(i & 0x3f)) + pr_info("\r%d vCPUs haven't rendezvoused...", + abs(rendezvoused) - 1); + rendezvoused = atomic_read(&rendezvous); + } + + clock_gettime(CLOCK_MONOTONIC, time); + + /* Release the vCPUs after getting the time of the previous action. */ + pr_info("\rAll vCPUs finished %s, releasing...\n", name); + if (rendezvoused > 0) + atomic_set(&rendezvous, -nr_vcpus - 1); + else + atomic_set(&rendezvous, nr_vcpus + 1); +} + +static void calc_default_nr_vcpus(void) +{ + cpu_set_t possible_mask; + int r; + + r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); + TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", + errno, strerror(errno)); + + nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; + TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); +} + +int main(int argc, char *argv[]) +{ + /* + * Skip the first 4gb and slot0. slot0 maps <1gb and is used to back + * the guest's code, stack, and page tables. Because selftests creates + * an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot + * just below the 4gb boundary. This test could create memory at + * 1gb-3gb,but it's simpler to skip straight to 4gb. + */ + const uint64_t start_gpa = SZ_4G; + const int first_slot = 1; + + struct timespec time_start, time_run1, time_reset, time_run2; + uint64_t max_gpa, gpa, slot_size, max_mem, i; + int max_slots, slot, opt, fd; + bool hugepages = false; + struct kvm_vcpu **vcpus; + pthread_t *threads; + struct kvm_vm *vm; + void *mem; + + /* + * Default to 2gb so that maxing out systems with MAXPHADDR=46, which + * are quite common for x86, requires changing only max_mem (KVM allows + * 32k memslots, 32k * 2gb == ~64tb of guest memory). + */ + slot_size = SZ_2G; + + max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_slots > first_slot, "KVM is broken"); + + /* All KVM MMUs should be able to survive a 128gb guest. 
*/ + max_mem = 128ull * SZ_1G; + + calc_default_nr_vcpus(); + + while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) { + switch (opt) { + case 'c': + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + break; + case 'm': + max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G; + break; + case 's': + slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G; + break; + case 'H': + hugepages = true; + break; + case 'h': + default: + printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]); + exit(1); + } + } + + vcpus = malloc(nr_vcpus * sizeof(*vcpus)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + + max_gpa = vm->max_gfn << vm->page_shift; + TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); + + fd = kvm_memfd_alloc(slot_size, hugepages); + mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + + TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed"); + + /* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */ + for (i = 0; i < slot_size; i += vm->page_size) + ((uint8_t *)mem)[i] = 0xaa; + + gpa = 0; + for (slot = first_slot; slot < max_slots; slot++) { + gpa = start_gpa + ((slot - first_slot) * slot_size); + if (gpa + slot_size > max_gpa) + break; + + if ((gpa - start_gpa) >= max_mem) + break; + + vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem); + +#ifdef __x86_64__ + /* Identity map memory in the guest using 1gb pages. */ + for (i = 0; i < slot_size; i += SZ_1G) + __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); +#else + for (i = 0; i < slot_size; i += vm->page_size) + virt_pg_map(vm, gpa + i, gpa + i); +#endif + } + + atomic_set(&rendezvous, nr_vcpus + 1); + threads = spawn_workers(vm, vcpus, start_gpa, gpa); + + free(vcpus); + vcpus = NULL; + + pr_info("Running with %lugb of guest memory and %u vCPUs\n", + (gpa - start_gpa) / SZ_1G, nr_vcpus); + + rendezvous_with_vcpus(&time_start, "spawning"); + rendezvous_with_vcpus(&time_run1, "run 1"); + rendezvous_with_vcpus(&time_reset, "reset"); + rendezvous_with_vcpus(&time_run2, "run 2"); + + time_run2 = timespec_sub(time_run2, time_reset); + time_reset = timespec_sub(time_reset, time_run1); + time_run1 = timespec_sub(time_run1, time_start); + + pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds\n", + time_run1.tv_sec, time_run1.tv_nsec, + time_reset.tv_sec, time_reset.tv_nsec, + time_run2.tv_sec, time_run2.tv_nsec); + + /* + * Delete even numbered slots (arbitrary) and unmap the first half of + * the backing (also arbitrary) to verify KVM correctly drops all + * references to the removed regions. + */ + for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2) + vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL); + + munmap(mem, slot_size / 2); + + /* Sanity check that the vCPUs actually ran. */ + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + /* + * Deliberately exit without deleting the remaining memslots or closing + * kvm_fd to test cleanup via mmu_notifier.release. 
+ */ +} -- cgit v1.2.3 From 55e164df482a48e168b26994197c2a848aae5959 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:37 -0800 Subject: KVM: selftests: Only muck with SREGS on x86 in mmu_stress_test Try to get/set SREGS in mmu_stress_test only when running on x86, as the ioctls are supported only by x86 and PPC, and the latter doesn't yet support KVM selftests. Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 0b9678858b6d..847da23ec1b1 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -59,10 +59,10 @@ static void run_vcpu(struct kvm_vcpu *vcpu) static void *vcpu_worker(void *data) { + struct kvm_sregs __maybe_unused sregs; struct vcpu_info *info = data; struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); @@ -70,12 +70,12 @@ static void *vcpu_worker(void *data) run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ + vcpu_sregs_get(vcpu, &sregs); /* Toggle CR0.WP to trigger a MMU context reset. */ sregs.cr0 ^= X86_CR0_WP; -#endif vcpu_sregs_set(vcpu, &sregs); +#endif rendezvous_with_boss(); run_vcpu(vcpu); -- cgit v1.2.3 From 1ddd3ea75ac3be79dbd800507dd7e08928bd454d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:38 -0800 Subject: KVM: selftests: Compute number of extra pages needed in mmu_stress_test Create mmu_stress_tests's VM with the correct number of extra pages needed to map all of memory in the guest. The bug hasn't been noticed before as the test currently runs only on x86, which maps guest memory with 1GiB pages, i.e. doesn't need much memory in the guest for page tables. 
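As a rough, illustrative sizing (not part of the patch; the selftests library derives the actual page-table reservation from this budget), the extra-pages budget works out to max_mem / SZ_1G on x86 and max_mem / page_size elsewhere:

	/* Illustrative arithmetic for the default 128GiB of test memory. */
	uint64_t max_mem = 128ull * SZ_1G;

	uint64_t nr_extra_x86   = max_mem / SZ_1G;	/* 128: one 1GiB mapping per slot of memory */
	uint64_t nr_extra_other = max_mem / 4096;	/* 33554432: one mapping per 4KiB base page */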
Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-8-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 847da23ec1b1..5467b12f5903 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -209,7 +209,13 @@ int main(int argc, char *argv[]) vcpus = malloc(nr_vcpus * sizeof(*vcpus)); TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); - vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + vm = __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, +#ifdef __x86_64__ + max_mem / SZ_1G, +#else + max_mem / vm_guest_mode_params[VM_MODE_DEFAULT].page_size, +#endif + guest_code, vcpus); max_gpa = vm->max_gfn << vm->page_shift; TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); -- cgit v1.2.3 From c35d8f579e50328f539bc049cc057f2158bb8e60 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:39 -0800 Subject: KVM: selftests: Explicitly include ucall_common.h in mmu_stress_test.c Explicitly include ucall_common.h in the MMU stress test, as unlike arm64 and x86-64, RISC-V doesn't include ucall_common.h in its processor.h, i.e. this will allow enabling the test on RISC-V. Reported-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 5467b12f5903..fbb693428a82 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -15,6 +15,7 @@ #include "test_util.h" #include "guest_modes.h" #include "processor.h" +#include "ucall_common.h" static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { -- cgit v1.2.3 From 8abe7632a1eebdfac2c553a21b4f980db416c166 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:40 -0800 Subject: KVM: selftests: Enable mmu_stress_test on arm64 Enable the mmu_stress_test on arm64. The intent was to enable the test across all architectures when it was first added, but a few goofs made it unrunnable on !x86. Now that those goofs are fixed, at least for arm64, enable the test.
Cc: Oliver Upton Cc: Marc Zyngier Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4384e5f45c36..c59a337cd4da 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -180,6 +180,7 @@ TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += kvm_page_table_test TEST_GEN_PROGS_aarch64 += memslot_modification_stress_test TEST_GEN_PROGS_aarch64 += memslot_perf_test +TEST_GEN_PROGS_aarch64 += mmu_stress_test TEST_GEN_PROGS_aarch64 += rseq_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time -- cgit v1.2.3 From 3a042252640450fd288c56953bf21583d5198fba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:41 -0800 Subject: KVM: selftests: Use vcpu_arch_put_guest() in mmu_stress_test Use vcpu_arch_put_guest() to write memory from the guest in mmu_stress_test as an easy way to provide a bit of extra coverage. Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-11-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index fbb693428a82..656a837c7f49 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -23,7 +23,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) for (;;) { for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; + vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); GUEST_SYNC(0); } } -- cgit v1.2.3 From 82b542e1184884885fe2b4aabd47672540db02c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:42 -0800 Subject: KVM: selftests: Precisely limit the number of guest loops in mmu_stress_test Run the exact number of guest loops required in mmu_stress_test instead of looping indefinitely in anticipation of adding more stages that run different code (e.g. reads instead of writes). 
Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 656a837c7f49..c6bf18cb7c89 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -20,12 +20,15 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; + int i; - for (;;) { + for (i = 0; i < 2; i++) { for (gpa = start_gpa; gpa < end_gpa; gpa += stride) vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); - GUEST_SYNC(0); + GUEST_SYNC(i); } + + GUEST_ASSERT(0); } struct vcpu_info { @@ -52,10 +55,18 @@ static void rendezvous_with_boss(void) } } -static void run_vcpu(struct kvm_vcpu *vcpu) +static void assert_sync_stage(struct kvm_vcpu *vcpu, int stage) +{ + struct ucall uc; + + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); + TEST_ASSERT_EQ(uc.args[1], stage); +} + +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) { vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + assert_sync_stage(vcpu, stage); } static void *vcpu_worker(void *data) @@ -69,7 +80,8 @@ static void *vcpu_worker(void *data) rendezvous_with_boss(); - run_vcpu(vcpu); + /* Stage 0, write all of guest memory. */ + run_vcpu(vcpu, 0); rendezvous_with_boss(); #ifdef __x86_64__ vcpu_sregs_get(vcpu, &sregs); @@ -79,7 +91,8 @@ static void *vcpu_worker(void *data) #endif rendezvous_with_boss(); - run_vcpu(vcpu); + /* Stage 1, re-write all of guest memory. */ + run_vcpu(vcpu, 1); rendezvous_with_boss(); return NULL; -- cgit v1.2.3 From 80b7859a3a43ff8bb924947a03b144626aeb1d0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:43 -0800 Subject: KVM: selftests: Add a read-only mprotect() phase to mmu_stress_test Add a third phase of mmu_stress_test to verify that mprotect()ing guest memory to make it read-only doesn't cause explosions, e.g. to verify KVM correctly handles the resulting mmu_notifier invalidations. Reviewed-by: James Houghton Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index c6bf18cb7c89..0918fade9267 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -28,6 +28,10 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) GUEST_SYNC(i); } + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa); + GUEST_SYNC(2); + GUEST_ASSERT(0); } @@ -95,6 +99,10 @@ static void *vcpu_worker(void *data) run_vcpu(vcpu, 1); rendezvous_with_boss(); + /* Stage 2, read all of guest memory, which is now read-only. 
*/ + run_vcpu(vcpu, 2); + rendezvous_with_boss(); + return NULL; } @@ -175,7 +183,7 @@ int main(int argc, char *argv[]) const uint64_t start_gpa = SZ_4G; const int first_slot = 1; - struct timespec time_start, time_run1, time_reset, time_run2; + struct timespec time_start, time_run1, time_reset, time_run2, time_ro; uint64_t max_gpa, gpa, slot_size, max_mem, i; int max_slots, slot, opt, fd; bool hugepages = false; @@ -279,14 +287,20 @@ int main(int argc, char *argv[]) rendezvous_with_vcpus(&time_reset, "reset"); rendezvous_with_vcpus(&time_run2, "run 2"); + mprotect(mem, slot_size, PROT_READ); + rendezvous_with_vcpus(&time_ro, "mprotect RO"); + + time_ro = timespec_sub(time_ro, time_run2); time_run2 = timespec_sub(time_run2, time_reset); - time_reset = timespec_sub(time_reset, time_run1); + time_reset = timespec_sub(time_reset, time_run1); time_run1 = timespec_sub(time_run1, time_start); - pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds\n", + pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds, " + "ro = %ld.%.9lds\n", time_run1.tv_sec, time_run1.tv_nsec, time_reset.tv_sec, time_reset.tv_nsec, - time_run2.tv_sec, time_run2.tv_nsec); + time_run2.tv_sec, time_run2.tv_nsec, + time_ro.tv_sec, time_ro.tv_nsec); /* * Delete even numbered slots (arbitrary) and unmap the first half of -- cgit v1.2.3 From b6c304aec6483f6cd254df690eda35b225cd856c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:44 -0800 Subject: KVM: selftests: Verify KVM correctly handles mprotect(PROT_READ) Add two phases to mmu_stress_test to verify that KVM correctly handles guest memory that was writable, and then made read-only in the primary MMU, and then made writable again. Add bonus coverage for x86 and arm64 to verify that all of guest memory was marked read-only. Making forward progress (without making memory writable) requires arch specific code to skip over the faulting instruction, but the test can at least verify each vCPU's starting page was made read-only for other architectures. Link: https://lore.kernel.org/r/20241128005547.4077116-14-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/mmu_stress_test.c | 104 +++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 0918fade9267..d9c76b4c0d88 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -17,6 +17,8 @@ #include "processor.h" #include "ucall_common.h" +static bool mprotect_ro_done; + static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; @@ -32,6 +34,42 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) *((volatile uint64_t *)gpa); GUEST_SYNC(2); + /* + * Write to the region while mprotect(PROT_READ) is underway. Keep + * looping until the memory is guaranteed to be read-only, otherwise + * vCPUs may complete their writes and advance to the next stage + * prematurely. + * + * For architectures that support skipping the faulting instruction, + * generate the store via inline assembly to ensure the exact length + * of the instruction is known and stable (vcpu_arch_put_guest() on + * fixed-length architectures should work, but the cost of paranoia + * is low in this case). For x86, hand-code the exact opcode so that + * there is no room for variability in the generated instruction. 
+ */ + do { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) +#ifdef __x86_64__ + asm volatile(".byte 0x48,0x89,0x00" :: "a"(gpa) : "memory"); /* mov %rax, (%rax) */ +#elif defined(__aarch64__) + asm volatile("str %0, [%0]" :: "r" (gpa) : "memory"); +#else + vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); +#endif + } while (!READ_ONCE(mprotect_ro_done)); + + /* + * Only architectures that write the entire range can explicitly sync, + * as other architectures will be stuck on the write fault. + */ +#if defined(__x86_64__) || defined(__aarch64__) + GUEST_SYNC(3); +#endif + + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); + GUEST_SYNC(4); + GUEST_ASSERT(0); } @@ -79,6 +117,7 @@ static void *vcpu_worker(void *data) struct vcpu_info *info = data; struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; + int r; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); @@ -101,6 +140,57 @@ static void *vcpu_worker(void *data) /* Stage 2, read all of guest memory, which is now read-only. */ run_vcpu(vcpu, 2); + + /* + * Stage 3, write guest memory and verify KVM returns -EFAULT for once + * the mprotect(PROT_READ) lands. Only architectures that support + * validating *all* of guest memory sync for this stage, as vCPUs will + * be stuck on the faulting instruction for other architectures. Go to + * stage 3 without a rendezvous + */ + do { + r = _vcpu_run(vcpu); + } while (!r); + TEST_ASSERT(r == -1 && errno == EFAULT, + "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno); + +#if defined(__x86_64__) || defined(__aarch64__) + /* + * Verify *all* writes from the guest hit EFAULT due to the VMA now + * being read-only. x86 and arm64 only at this time as skipping the + * instruction that hits the EFAULT requires advancing the program + * counter, which is arch specific and relies on inline assembly. + */ +#ifdef __x86_64__ + vcpu->run->kvm_valid_regs = KVM_SYNC_X86_REGS; +#endif + for (;;) { + r = _vcpu_run(vcpu); + if (!r) + break; + TEST_ASSERT_EQ(errno, EFAULT); +#if defined(__x86_64__) + WRITE_ONCE(vcpu->run->kvm_dirty_regs, KVM_SYNC_X86_REGS); + vcpu->run->s.regs.regs.rip += 3; +#elif defined(__aarch64__) + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)) + 4); +#endif + + } + assert_sync_stage(vcpu, 3); +#endif /* __x86_64__ || __aarch64__ */ + rendezvous_with_boss(); + + /* + * Stage 4. Run to completion, waiting for mprotect(PROT_WRITE) to + * make the memory writable again. 
+ */ + do { + r = _vcpu_run(vcpu); + } while (r && errno == EFAULT); + TEST_ASSERT_EQ(r, 0); + assert_sync_stage(vcpu, 4); rendezvous_with_boss(); return NULL; @@ -183,7 +273,7 @@ int main(int argc, char *argv[]) const uint64_t start_gpa = SZ_4G; const int first_slot = 1; - struct timespec time_start, time_run1, time_reset, time_run2, time_ro; + struct timespec time_start, time_run1, time_reset, time_run2, time_ro, time_rw; uint64_t max_gpa, gpa, slot_size, max_mem, i; int max_slots, slot, opt, fd; bool hugepages = false; @@ -288,19 +378,27 @@ int main(int argc, char *argv[]) rendezvous_with_vcpus(&time_run2, "run 2"); mprotect(mem, slot_size, PROT_READ); + usleep(10); + mprotect_ro_done = true; + sync_global_to_guest(vm, mprotect_ro_done); + rendezvous_with_vcpus(&time_ro, "mprotect RO"); + mprotect(mem, slot_size, PROT_READ | PROT_WRITE); + rendezvous_with_vcpus(&time_rw, "mprotect RW"); + time_rw = timespec_sub(time_rw, time_ro); time_ro = timespec_sub(time_ro, time_run2); time_run2 = timespec_sub(time_run2, time_reset); time_reset = timespec_sub(time_reset, time_run1); time_run1 = timespec_sub(time_run1, time_start); pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds, " - "ro = %ld.%.9lds\n", + "ro = %ld.%.9lds, rw = %ld.%.9lds\n", time_run1.tv_sec, time_run1.tv_nsec, time_reset.tv_sec, time_reset.tv_nsec, time_run2.tv_sec, time_run2.tv_nsec, - time_ro.tv_sec, time_ro.tv_nsec); + time_ro.tv_sec, time_ro.tv_nsec, + time_rw.tv_sec, time_rw.tv_nsec); /* * Delete even numbered slots (arbitrary) and unmap the first half of -- cgit v1.2.3 From 43fbd8cd389faa9760c5152b1c58e893c812953b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:45 -0800 Subject: KVM: selftests: Provide empty 'all' and 'clean' targets for unsupported ARCHs Provide empty targets for KVM selftests if the target architecture is unsupported to make it obvious which architectures are supported, and so that various side effects don't fail and/or do weird things, e.g. as is, "mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))" fails due to a missing operand, and conversely, "$(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) ..." will create an empty, useless directory for the unsupported architecture. Move the guts of the Makefile to Makefile.kvm so that it's easier to see that the if-statement effectively guards all of KVM selftests. 
Reported-by: Muhammad Usama Anjum Acked-by: Muhammad Usama Anjum Acked-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-15-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 336 +------------------------------ tools/testing/selftests/kvm/Makefile.kvm | 334 ++++++++++++++++++++++++++++++ 3 files changed, 340 insertions(+), 331 deletions(-) create mode 100644 tools/testing/selftests/kvm/Makefile.kvm (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 7f57abf936e7..1d41a046a7bf 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -9,3 +9,4 @@ !config !settings !Makefile +!Makefile.kvm \ No newline at end of file diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c59a337cd4da..7b33464bf8cc 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -1,12 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -include ../../../build/Build.include - -all: - top_srcdir = ../../../.. include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) +ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64)) ifeq ($(ARCH),x86) ARCH_DIR := x86_64 else ifeq ($(ARCH),arm64) @@ -17,332 +14,9 @@ else ARCH_DIR := $(ARCH) endif -LIBKVM += lib/assert.c -LIBKVM += lib/elf.c -LIBKVM += lib/guest_modes.c -LIBKVM += lib/io.c -LIBKVM += lib/kvm_util.c -LIBKVM += lib/memstress.c -LIBKVM += lib/guest_sprintf.c -LIBKVM += lib/rbtree.c -LIBKVM += lib/sparsebit.c -LIBKVM += lib/test_util.c -LIBKVM += lib/ucall_common.c -LIBKVM += lib/userfaultfd_util.c - -LIBKVM_STRING += lib/string_override.c - -LIBKVM_x86_64 += lib/x86_64/apic.c -LIBKVM_x86_64 += lib/x86_64/handlers.S -LIBKVM_x86_64 += lib/x86_64/hyperv.c -LIBKVM_x86_64 += lib/x86_64/memstress.c -LIBKVM_x86_64 += lib/x86_64/pmu.c -LIBKVM_x86_64 += lib/x86_64/processor.c -LIBKVM_x86_64 += lib/x86_64/sev.c -LIBKVM_x86_64 += lib/x86_64/svm.c -LIBKVM_x86_64 += lib/x86_64/ucall.c -LIBKVM_x86_64 += lib/x86_64/vmx.c - -LIBKVM_aarch64 += lib/aarch64/gic.c -LIBKVM_aarch64 += lib/aarch64/gic_v3.c -LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c -LIBKVM_aarch64 += lib/aarch64/handlers.S -LIBKVM_aarch64 += lib/aarch64/processor.c -LIBKVM_aarch64 += lib/aarch64/spinlock.c -LIBKVM_aarch64 += lib/aarch64/ucall.c -LIBKVM_aarch64 += lib/aarch64/vgic.c - -LIBKVM_s390x += lib/s390x/diag318_test_handler.c -LIBKVM_s390x += lib/s390x/processor.c -LIBKVM_s390x += lib/s390x/ucall.c -LIBKVM_s390x += lib/s390x/facility.c - -LIBKVM_riscv += lib/riscv/handlers.S -LIBKVM_riscv += lib/riscv/processor.c -LIBKVM_riscv += lib/riscv/ucall.c - -# Non-compiled test targets -TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh - -# Compiled test targets -TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test -TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test -TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test -TEST_GEN_PROGS_x86_64 += x86_64/feature_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test -TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test -TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_extended_hypercalls -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi 
-TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush -TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test -TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test -TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test -TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test -TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test -TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test -TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test -TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test -TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test -TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id -TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test -TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test -TEST_GEN_PROGS_x86_64 += x86_64/smm_test -TEST_GEN_PROGS_x86_64 += x86_64/state_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_shutdown_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test -TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync -TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test -TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test -TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test -TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state -TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state -TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test -TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test -TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test -TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test -TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test -TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test -TEST_GEN_PROGS_x86_64 += x86_64/debug_regs -TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test -TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test -TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests -TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests -TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test -TEST_GEN_PROGS_x86_64 += x86_64/amx_test -TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test -TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test -TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test -TEST_GEN_PROGS_x86_64 += access_tracking_perf_test -TEST_GEN_PROGS_x86_64 += coalesced_io_test -TEST_GEN_PROGS_x86_64 += demand_paging_test -TEST_GEN_PROGS_x86_64 += dirty_log_test -TEST_GEN_PROGS_x86_64 += dirty_log_perf_test -TEST_GEN_PROGS_x86_64 += guest_memfd_test -TEST_GEN_PROGS_x86_64 += guest_print_test -TEST_GEN_PROGS_x86_64 += hardware_disable_test -TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus -TEST_GEN_PROGS_x86_64 += kvm_page_table_test -TEST_GEN_PROGS_x86_64 += mmu_stress_test -TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test -TEST_GEN_PROGS_x86_64 += memslot_perf_test -TEST_GEN_PROGS_x86_64 += rseq_test -TEST_GEN_PROGS_x86_64 += set_memory_region_test -TEST_GEN_PROGS_x86_64 += steal_time -TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test -TEST_GEN_PROGS_x86_64 += system_counter_offset_test 
-TEST_GEN_PROGS_x86_64 += pre_fault_memory_test - -# Compiled outputs used by test targets -TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test - -TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs -TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases -TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions -TEST_GEN_PROGS_aarch64 += aarch64/hypercalls -TEST_GEN_PROGS_aarch64 += aarch64/mmio_abort -TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test -TEST_GEN_PROGS_aarch64 += aarch64/psci_test -TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs -TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter -TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config -TEST_GEN_PROGS_aarch64 += aarch64/vgic_init -TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq -TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress -TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access -TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 -TEST_GEN_PROGS_aarch64 += access_tracking_perf_test -TEST_GEN_PROGS_aarch64 += arch_timer -TEST_GEN_PROGS_aarch64 += coalesced_io_test -TEST_GEN_PROGS_aarch64 += demand_paging_test -TEST_GEN_PROGS_aarch64 += dirty_log_test -TEST_GEN_PROGS_aarch64 += dirty_log_perf_test -TEST_GEN_PROGS_aarch64 += guest_print_test -TEST_GEN_PROGS_aarch64 += get-reg-list -TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus -TEST_GEN_PROGS_aarch64 += kvm_page_table_test -TEST_GEN_PROGS_aarch64 += memslot_modification_stress_test -TEST_GEN_PROGS_aarch64 += memslot_perf_test -TEST_GEN_PROGS_aarch64 += mmu_stress_test -TEST_GEN_PROGS_aarch64 += rseq_test -TEST_GEN_PROGS_aarch64 += set_memory_region_test -TEST_GEN_PROGS_aarch64 += steal_time -TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test - -TEST_GEN_PROGS_s390x = s390x/memop -TEST_GEN_PROGS_s390x += s390x/resets -TEST_GEN_PROGS_s390x += s390x/sync_regs_test -TEST_GEN_PROGS_s390x += s390x/tprot -TEST_GEN_PROGS_s390x += s390x/cmma_test -TEST_GEN_PROGS_s390x += s390x/debug_test -TEST_GEN_PROGS_s390x += s390x/cpumodel_subfuncs_test -TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test -TEST_GEN_PROGS_s390x += s390x/ucontrol_test -TEST_GEN_PROGS_s390x += demand_paging_test -TEST_GEN_PROGS_s390x += dirty_log_test -TEST_GEN_PROGS_s390x += guest_print_test -TEST_GEN_PROGS_s390x += kvm_create_max_vcpus -TEST_GEN_PROGS_s390x += kvm_page_table_test -TEST_GEN_PROGS_s390x += rseq_test -TEST_GEN_PROGS_s390x += set_memory_region_test -TEST_GEN_PROGS_s390x += kvm_binary_stats_test - -TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test -TEST_GEN_PROGS_riscv += riscv/ebreak_test -TEST_GEN_PROGS_riscv += arch_timer -TEST_GEN_PROGS_riscv += coalesced_io_test -TEST_GEN_PROGS_riscv += demand_paging_test -TEST_GEN_PROGS_riscv += dirty_log_test -TEST_GEN_PROGS_riscv += get-reg-list -TEST_GEN_PROGS_riscv += guest_print_test -TEST_GEN_PROGS_riscv += kvm_binary_stats_test -TEST_GEN_PROGS_riscv += kvm_create_max_vcpus -TEST_GEN_PROGS_riscv += kvm_page_table_test -TEST_GEN_PROGS_riscv += set_memory_region_test -TEST_GEN_PROGS_riscv += steal_time - -SPLIT_TESTS += arch_timer -SPLIT_TESTS += get-reg-list - -TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR)) -TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR)) -TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR)) -LIBKVM += $(LIBKVM_$(ARCH_DIR)) - -OVERRIDE_TARGETS = 1 - -# lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most -# importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`, -# which causes the environment variable to override the makefile). 
-include ../lib.mk - -INSTALL_HDR_PATH = $(top_srcdir)/usr -LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ -LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include -ifeq ($(ARCH),x86_64) -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include -else -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include -endif -CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ - -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ - -fno-builtin-memcmp -fno-builtin-memcpy \ - -fno-builtin-memset -fno-builtin-strnlen \ - -fno-stack-protector -fno-PIE -fno-strict-aliasing \ - -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \ - -I$(LINUX_HDR_PATH) -Iinclude -I$(/dev/null; echo "$$?"),0) - CFLAGS += -march=x86-64-v2 -endif -endif -ifeq ($(ARCH),arm64) -tools_dir := $(top_srcdir)/tools -arm64_tools_dir := $(tools_dir)/arch/arm64/tools/ - -ifneq ($(abs_objdir),) -arm64_hdr_outdir := $(abs_objdir)/tools/ +include Makefile.kvm else -arm64_hdr_outdir := $(tools_dir)/ -endif - -GEN_HDRS := $(arm64_hdr_outdir)arch/arm64/include/generated/ -CFLAGS += -I$(GEN_HDRS) - -$(GEN_HDRS): $(wildcard $(arm64_tools_dir)/*) - $(MAKE) -C $(arm64_tools_dir) OUTPUT=$(arm64_hdr_outdir) +# Empty targets for unsupported architectures +all: +clean: endif - -no-pie-option := $(call try-run, echo 'int main(void) { return 0; }' | \ - $(CC) -Werror $(CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -# On s390, build the testcases KVM-enabled -pgste-option = $(call try-run, echo 'int main(void) { return 0; }' | \ - $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste) - -LDLIBS += -ldl -LDFLAGS += -pthread $(no-pie-option) $(pgste-option) - -LIBKVM_C := $(filter %.c,$(LIBKVM)) -LIBKVM_S := $(filter %.S,$(LIBKVM)) -LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) -LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) -LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) -LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) -SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) -SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS)) - -TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS)) -TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) -TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ)) -TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS)) -TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TEST_GEN_OBJ)) --include $(TEST_DEP_FILES) - -$(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) - -$(filter-out $(SPLIT_TEST_GEN_PROGS), $(TEST_GEN_PROGS)) \ -$(TEST_GEN_PROGS_EXTENDED): %: %.o - $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@ -$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c - $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ - -$(SPLIT_TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/$(ARCH_DIR)/%.o - $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $^ $(LDLIBS) -o $@ -$(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH_DIR)/%.o: $(ARCH_DIR)/%.c - $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ - -EXTRA_CLEAN += $(GEN_HDRS) \ - $(LIBKVM_OBJS) \ - $(SPLIT_TEST_GEN_OBJ) \ - $(TEST_DEP_FILES) \ - $(TEST_GEN_OBJ) \ - cscope.* - -$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS) - $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ - -$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(GEN_HDRS) - $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ - -# Compile the string overrides as freestanding to 
prevent the compiler from -# generating self-referential code, e.g. without "freestanding" the compiler may -# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion. -$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c - $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@ - -$(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) -$(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS) -$(TEST_GEN_PROGS): $(LIBKVM_OBJS) -$(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS) -$(TEST_GEN_OBJ): $(GEN_HDRS) - -cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. -cscope: - $(RM) cscope.* - (find $(include_paths) -name '*.h' \ - -exec realpath --relative-base=$(PWD) {} \;; \ - find . -name '*.c' \ - -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files - cscope -b diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm new file mode 100644 index 000000000000..e988a72f8c20 --- /dev/null +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -0,0 +1,334 @@ +# SPDX-License-Identifier: GPL-2.0-only +include ../../../build/Build.include + +all: + +LIBKVM += lib/assert.c +LIBKVM += lib/elf.c +LIBKVM += lib/guest_modes.c +LIBKVM += lib/io.c +LIBKVM += lib/kvm_util.c +LIBKVM += lib/memstress.c +LIBKVM += lib/guest_sprintf.c +LIBKVM += lib/rbtree.c +LIBKVM += lib/sparsebit.c +LIBKVM += lib/test_util.c +LIBKVM += lib/ucall_common.c +LIBKVM += lib/userfaultfd_util.c + +LIBKVM_STRING += lib/string_override.c + +LIBKVM_x86_64 += lib/x86_64/apic.c +LIBKVM_x86_64 += lib/x86_64/handlers.S +LIBKVM_x86_64 += lib/x86_64/hyperv.c +LIBKVM_x86_64 += lib/x86_64/memstress.c +LIBKVM_x86_64 += lib/x86_64/pmu.c +LIBKVM_x86_64 += lib/x86_64/processor.c +LIBKVM_x86_64 += lib/x86_64/sev.c +LIBKVM_x86_64 += lib/x86_64/svm.c +LIBKVM_x86_64 += lib/x86_64/ucall.c +LIBKVM_x86_64 += lib/x86_64/vmx.c + +LIBKVM_aarch64 += lib/aarch64/gic.c +LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c +LIBKVM_aarch64 += lib/aarch64/handlers.S +LIBKVM_aarch64 += lib/aarch64/processor.c +LIBKVM_aarch64 += lib/aarch64/spinlock.c +LIBKVM_aarch64 += lib/aarch64/ucall.c +LIBKVM_aarch64 += lib/aarch64/vgic.c + +LIBKVM_s390x += lib/s390x/diag318_test_handler.c +LIBKVM_s390x += lib/s390x/processor.c +LIBKVM_s390x += lib/s390x/ucall.c +LIBKVM_s390x += lib/s390x/facility.c + +LIBKVM_riscv += lib/riscv/handlers.S +LIBKVM_riscv += lib/riscv/processor.c +LIBKVM_riscv += lib/riscv/ucall.c + +# Non-compiled test targets +TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh + +# Compiled test targets +TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test +TEST_GEN_PROGS_x86_64 += x86_64/feature_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test +TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test +TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_extended_hypercalls +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush +TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test +TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test +TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test +TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test 
+TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test +TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id +TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test +TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test +TEST_GEN_PROGS_x86_64 += x86_64/smm_test +TEST_GEN_PROGS_x86_64 += x86_64/state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_shutdown_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test +TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync +TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test +TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test +TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state +TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test +TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test +TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test +TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test +TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/debug_regs +TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests +TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests +TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test +TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test +TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test +TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test +TEST_GEN_PROGS_x86_64 += coalesced_io_test +TEST_GEN_PROGS_x86_64 += demand_paging_test +TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += guest_memfd_test +TEST_GEN_PROGS_x86_64 += guest_print_test +TEST_GEN_PROGS_x86_64 += hardware_disable_test +TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += kvm_page_table_test +TEST_GEN_PROGS_x86_64 += mmu_stress_test +TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test +TEST_GEN_PROGS_x86_64 += memslot_perf_test +TEST_GEN_PROGS_x86_64 += rseq_test +TEST_GEN_PROGS_x86_64 += set_memory_region_test +TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test +TEST_GEN_PROGS_x86_64 += system_counter_offset_test +TEST_GEN_PROGS_x86_64 += pre_fault_memory_test + +# Compiled outputs used by test targets +TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test + +TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases +TEST_GEN_PROGS_aarch64 += 
aarch64/debug-exceptions +TEST_GEN_PROGS_aarch64 += aarch64/hypercalls +TEST_GEN_PROGS_aarch64 += aarch64/mmio_abort +TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test +TEST_GEN_PROGS_aarch64 += aarch64/psci_test +TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter +TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config +TEST_GEN_PROGS_aarch64 += aarch64/vgic_init +TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress +TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access +TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 +TEST_GEN_PROGS_aarch64 += access_tracking_perf_test +TEST_GEN_PROGS_aarch64 += arch_timer +TEST_GEN_PROGS_aarch64 += coalesced_io_test +TEST_GEN_PROGS_aarch64 += demand_paging_test +TEST_GEN_PROGS_aarch64 += dirty_log_test +TEST_GEN_PROGS_aarch64 += dirty_log_perf_test +TEST_GEN_PROGS_aarch64 += guest_print_test +TEST_GEN_PROGS_aarch64 += get-reg-list +TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += kvm_page_table_test +TEST_GEN_PROGS_aarch64 += memslot_modification_stress_test +TEST_GEN_PROGS_aarch64 += memslot_perf_test +TEST_GEN_PROGS_aarch64 += mmu_stress_test +TEST_GEN_PROGS_aarch64 += rseq_test +TEST_GEN_PROGS_aarch64 += set_memory_region_test +TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test + +TEST_GEN_PROGS_s390x = s390x/memop +TEST_GEN_PROGS_s390x += s390x/resets +TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += s390x/tprot +TEST_GEN_PROGS_s390x += s390x/cmma_test +TEST_GEN_PROGS_s390x += s390x/debug_test +TEST_GEN_PROGS_s390x += s390x/cpumodel_subfuncs_test +TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test +TEST_GEN_PROGS_s390x += s390x/ucontrol_test +TEST_GEN_PROGS_s390x += demand_paging_test +TEST_GEN_PROGS_s390x += dirty_log_test +TEST_GEN_PROGS_s390x += guest_print_test +TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += kvm_page_table_test +TEST_GEN_PROGS_s390x += rseq_test +TEST_GEN_PROGS_s390x += set_memory_region_test +TEST_GEN_PROGS_s390x += kvm_binary_stats_test + +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test +TEST_GEN_PROGS_riscv += arch_timer +TEST_GEN_PROGS_riscv += coalesced_io_test +TEST_GEN_PROGS_riscv += demand_paging_test +TEST_GEN_PROGS_riscv += dirty_log_test +TEST_GEN_PROGS_riscv += get-reg-list +TEST_GEN_PROGS_riscv += guest_print_test +TEST_GEN_PROGS_riscv += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += kvm_create_max_vcpus +TEST_GEN_PROGS_riscv += kvm_page_table_test +TEST_GEN_PROGS_riscv += set_memory_region_test +TEST_GEN_PROGS_riscv += steal_time + +SPLIT_TESTS += arch_timer +SPLIT_TESTS += get-reg-list + +TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR)) +TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR)) +TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR)) +LIBKVM += $(LIBKVM_$(ARCH_DIR)) + +OVERRIDE_TARGETS = 1 + +# lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most +# importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`, +# which causes the environment variable to override the makefile). 
+include ../lib.mk + +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ +LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include +ifeq ($(ARCH),x86_64) +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include +else +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include +endif +CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ + -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memset -fno-builtin-strnlen \ + -fno-stack-protector -fno-PIE -fno-strict-aliasing \ + -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \ + -I$(LINUX_HDR_PATH) -Iinclude -I$(/dev/null; echo "$$?"),0) + CFLAGS += -march=x86-64-v2 +endif +endif +ifeq ($(ARCH),arm64) +tools_dir := $(top_srcdir)/tools +arm64_tools_dir := $(tools_dir)/arch/arm64/tools/ + +ifneq ($(abs_objdir),) +arm64_hdr_outdir := $(abs_objdir)/tools/ +else +arm64_hdr_outdir := $(tools_dir)/ +endif + +GEN_HDRS := $(arm64_hdr_outdir)arch/arm64/include/generated/ +CFLAGS += -I$(GEN_HDRS) + +$(GEN_HDRS): $(wildcard $(arm64_tools_dir)/*) + $(MAKE) -C $(arm64_tools_dir) OUTPUT=$(arm64_hdr_outdir) +endif + +no-pie-option := $(call try-run, echo 'int main(void) { return 0; }' | \ + $(CC) -Werror $(CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +# On s390, build the testcases KVM-enabled +pgste-option = $(call try-run, echo 'int main(void) { return 0; }' | \ + $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste) + +LDLIBS += -ldl +LDFLAGS += -pthread $(no-pie-option) $(pgste-option) + +LIBKVM_C := $(filter %.c,$(LIBKVM)) +LIBKVM_S := $(filter %.S,$(LIBKVM)) +LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) +LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) +LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) +SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) +SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS)) + +TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS)) +TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) +TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TEST_GEN_OBJ)) +-include $(TEST_DEP_FILES) + +$(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) + +$(filter-out $(SPLIT_TEST_GEN_PROGS), $(TEST_GEN_PROGS)) \ +$(TEST_GEN_PROGS_EXTENDED): %: %.o + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@ +$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +$(SPLIT_TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/$(ARCH_DIR)/%.o + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $^ $(LDLIBS) -o $@ +$(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH_DIR)/%.o: $(ARCH_DIR)/%.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(GEN_HDRS) \ + $(LIBKVM_OBJS) \ + $(SPLIT_TEST_GEN_OBJ) \ + $(TEST_DEP_FILES) \ + $(TEST_GEN_OBJ) \ + cscope.* + +$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS) + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(GEN_HDRS) + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +# Compile the string overrides as freestanding to prevent the compiler from +# generating self-referential code, e.g. 
without "freestanding" the compiler may +# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion. +$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@ + +$(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) +$(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS) +$(TEST_GEN_PROGS): $(LIBKVM_OBJS) +$(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS) +$(TEST_GEN_OBJ): $(GEN_HDRS) + +cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. +cscope: + $(RM) cscope.* + (find $(include_paths) -name '*.h' \ + -exec realpath --relative-base=$(PWD) {} \;; \ + find . -name '*.c' \ + -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files + cscope -b -- cgit v1.2.3 From 67730e6c53d70fb31618230f81c4acee9f72eaa3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:46 -0800 Subject: KVM: selftests: Use canonical $(ARCH) paths for KVM selftests directories Use the kernel's canonical $(ARCH) paths instead of the raw target triple for KVM selftests directories. KVM selftests are quite nearly the only place in the entire kernel that using the target triple for directories, tools/testing/selftests/drivers/s390x being the lone holdout. Using the kernel's preferred nomenclature eliminates the minor, but annoying, friction of having to translate to KVM's selftests directories, e.g. for pattern matching, opening files, running selftests, etc. Opportunsitically delete file comments that reference the full path of the file, as they are obviously prone to becoming stale, and serve no known purpose. Reviewed-by: Muhammad Usama Anjum Acked-by: Claudio Imbrenda Acked-by: Andrew Jones Link: https://lore.kernel.org/r/20241128005547.4077116-16-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 10 +- tools/testing/selftests/kvm/Makefile.kvm | 328 +++-- .../selftests/kvm/aarch64/aarch32_id_regs.c | 167 --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 220 --- .../selftests/kvm/aarch64/arch_timer_edge_cases.c | 1062 --------------- .../selftests/kvm/aarch64/debug-exceptions.c | 607 --------- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 771 ----------- tools/testing/selftests/kvm/aarch64/hypercalls.c | 308 ----- tools/testing/selftests/kvm/aarch64/mmio_abort.c | 159 --- tools/testing/selftests/kvm/aarch64/no-vgic-v3.c | 175 --- .../selftests/kvm/aarch64/page_fault_test.c | 1135 ---------------- tools/testing/selftests/kvm/aarch64/psci_test.c | 290 ---- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 695 ---------- tools/testing/selftests/kvm/aarch64/smccc_filter.c | 268 ---- .../selftests/kvm/aarch64/vcpu_width_config.c | 121 -- tools/testing/selftests/kvm/aarch64/vgic_init.c | 764 ----------- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 847 ------------ .../selftests/kvm/aarch64/vgic_lpi_stress.c | 410 ------ .../selftests/kvm/aarch64/vpmu_counter_access.c | 648 --------- .../testing/selftests/kvm/arm64/aarch32_id_regs.c | 167 +++ tools/testing/selftests/kvm/arm64/arch_timer.c | 220 +++ .../selftests/kvm/arm64/arch_timer_edge_cases.c | 1062 +++++++++++++++ .../testing/selftests/kvm/arm64/debug-exceptions.c | 607 +++++++++ tools/testing/selftests/kvm/arm64/get-reg-list.c | 771 +++++++++++ tools/testing/selftests/kvm/arm64/hypercalls.c | 308 +++++ tools/testing/selftests/kvm/arm64/mmio_abort.c | 159 +++ tools/testing/selftests/kvm/arm64/no-vgic-v3.c | 175 +++ .../testing/selftests/kvm/arm64/page_fault_test.c | 1135 ++++++++++++++++ 
tools/testing/selftests/kvm/arm64/psci_test.c | 290 ++++ tools/testing/selftests/kvm/arm64/set_id_regs.c | 695 ++++++++++ tools/testing/selftests/kvm/arm64/smccc_filter.c | 268 ++++ .../selftests/kvm/arm64/vcpu_width_config.c | 121 ++ tools/testing/selftests/kvm/arm64/vgic_init.c | 764 +++++++++++ tools/testing/selftests/kvm/arm64/vgic_irq.c | 847 ++++++++++++ .../testing/selftests/kvm/arm64/vgic_lpi_stress.c | 410 ++++++ .../selftests/kvm/arm64/vpmu_counter_access.c | 648 +++++++++ tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- .../selftests/kvm/include/aarch64/arch_timer.h | 158 --- .../testing/selftests/kvm/include/aarch64/delay.h | 25 - tools/testing/selftests/kvm/include/aarch64/gic.h | 64 - .../testing/selftests/kvm/include/aarch64/gic_v3.h | 604 --------- .../selftests/kvm/include/aarch64/gic_v3_its.h | 19 - .../selftests/kvm/include/aarch64/kvm_util_arch.h | 7 - .../selftests/kvm/include/aarch64/processor.h | 238 ---- .../selftests/kvm/include/aarch64/spinlock.h | 13 - .../testing/selftests/kvm/include/aarch64/ucall.h | 20 - tools/testing/selftests/kvm/include/aarch64/vgic.h | 37 - .../selftests/kvm/include/arm64/arch_timer.h | 158 +++ tools/testing/selftests/kvm/include/arm64/delay.h | 25 + tools/testing/selftests/kvm/include/arm64/gic.h | 64 + tools/testing/selftests/kvm/include/arm64/gic_v3.h | 604 +++++++++ .../selftests/kvm/include/arm64/gic_v3_its.h | 19 + .../selftests/kvm/include/arm64/kvm_util_arch.h | 7 + .../selftests/kvm/include/arm64/processor.h | 238 ++++ .../testing/selftests/kvm/include/arm64/spinlock.h | 13 + tools/testing/selftests/kvm/include/arm64/ucall.h | 20 + tools/testing/selftests/kvm/include/arm64/vgic.h | 37 + .../selftests/kvm/include/s390/debug_print.h | 69 + .../kvm/include/s390/diag318_test_handler.h | 13 + .../testing/selftests/kvm/include/s390/facility.h | 50 + .../selftests/kvm/include/s390/kvm_util_arch.h | 7 + .../testing/selftests/kvm/include/s390/processor.h | 41 + tools/testing/selftests/kvm/include/s390/sie.h | 240 ++++ tools/testing/selftests/kvm/include/s390/ucall.h | 19 + .../selftests/kvm/include/s390x/debug_print.h | 69 - .../kvm/include/s390x/diag318_test_handler.h | 13 - .../testing/selftests/kvm/include/s390x/facility.h | 50 - .../selftests/kvm/include/s390x/kvm_util_arch.h | 7 - .../selftests/kvm/include/s390x/processor.h | 41 - tools/testing/selftests/kvm/include/s390x/sie.h | 240 ---- tools/testing/selftests/kvm/include/s390x/ucall.h | 19 - tools/testing/selftests/kvm/include/x86/apic.h | 118 ++ tools/testing/selftests/kvm/include/x86/evmcs.h | 1276 ++++++++++++++++++ tools/testing/selftests/kvm/include/x86/hyperv.h | 361 +++++ .../selftests/kvm/include/x86/kvm_util_arch.h | 51 + tools/testing/selftests/kvm/include/x86/mce.h | 23 + tools/testing/selftests/kvm/include/x86/pmu.h | 97 ++ .../testing/selftests/kvm/include/x86/processor.h | 1395 +++++++++++++++++++ tools/testing/selftests/kvm/include/x86/sev.h | 96 ++ tools/testing/selftests/kvm/include/x86/svm.h | 320 +++++ tools/testing/selftests/kvm/include/x86/svm_util.h | 62 + tools/testing/selftests/kvm/include/x86/ucall.h | 13 + tools/testing/selftests/kvm/include/x86/vmx.h | 575 ++++++++ tools/testing/selftests/kvm/include/x86_64/apic.h | 120 -- tools/testing/selftests/kvm/include/x86_64/evmcs.h | 1279 ------------------ .../testing/selftests/kvm/include/x86_64/hyperv.h | 364 ----- .../selftests/kvm/include/x86_64/kvm_util_arch.h | 51 - tools/testing/selftests/kvm/include/x86_64/mce.h | 25 - tools/testing/selftests/kvm/include/x86_64/pmu.h | 97 -- 
.../selftests/kvm/include/x86_64/processor.h | 1397 -------------------- tools/testing/selftests/kvm/include/x86_64/sev.h | 96 -- tools/testing/selftests/kvm/include/x86_64/svm.h | 326 ----- .../selftests/kvm/include/x86_64/svm_util.h | 65 - tools/testing/selftests/kvm/include/x86_64/ucall.h | 13 - tools/testing/selftests/kvm/include/x86_64/vmx.h | 577 -------- tools/testing/selftests/kvm/lib/aarch64/gic.c | 157 --- .../selftests/kvm/lib/aarch64/gic_private.h | 32 - tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 427 ------ .../testing/selftests/kvm/lib/aarch64/gic_v3_its.c | 248 ---- tools/testing/selftests/kvm/lib/aarch64/handlers.S | 126 -- .../testing/selftests/kvm/lib/aarch64/processor.c | 647 --------- tools/testing/selftests/kvm/lib/aarch64/spinlock.c | 27 - tools/testing/selftests/kvm/lib/aarch64/ucall.c | 34 - tools/testing/selftests/kvm/lib/aarch64/vgic.c | 188 --- tools/testing/selftests/kvm/lib/arm64/gic.c | 157 +++ .../testing/selftests/kvm/lib/arm64/gic_private.h | 32 + tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 427 ++++++ tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 248 ++++ tools/testing/selftests/kvm/lib/arm64/handlers.S | 126 ++ tools/testing/selftests/kvm/lib/arm64/processor.c | 647 +++++++++ tools/testing/selftests/kvm/lib/arm64/spinlock.c | 27 + tools/testing/selftests/kvm/lib/arm64/ucall.c | 34 + tools/testing/selftests/kvm/lib/arm64/vgic.c | 188 +++ .../selftests/kvm/lib/s390/diag318_test_handler.c | 80 ++ tools/testing/selftests/kvm/lib/s390/facility.c | 14 + tools/testing/selftests/kvm/lib/s390/processor.c | 223 ++++ tools/testing/selftests/kvm/lib/s390/ucall.c | 22 + .../selftests/kvm/lib/s390x/diag318_test_handler.c | 80 -- tools/testing/selftests/kvm/lib/s390x/facility.c | 14 - tools/testing/selftests/kvm/lib/s390x/processor.c | 223 ---- tools/testing/selftests/kvm/lib/s390x/ucall.c | 22 - tools/testing/selftests/kvm/lib/x86/apic.c | 43 + tools/testing/selftests/kvm/lib/x86/handlers.S | 81 ++ tools/testing/selftests/kvm/lib/x86/hyperv.c | 113 ++ tools/testing/selftests/kvm/lib/x86/memstress.c | 112 ++ tools/testing/selftests/kvm/lib/x86/pmu.c | 31 + tools/testing/selftests/kvm/lib/x86/processor.c | 1293 ++++++++++++++++++ tools/testing/selftests/kvm/lib/x86/sev.c | 141 ++ tools/testing/selftests/kvm/lib/x86/svm.c | 163 +++ tools/testing/selftests/kvm/lib/x86/ucall.c | 56 + tools/testing/selftests/kvm/lib/x86/vmx.c | 552 ++++++++ tools/testing/selftests/kvm/lib/x86_64/apic.c | 43 - tools/testing/selftests/kvm/lib/x86_64/handlers.S | 81 -- tools/testing/selftests/kvm/lib/x86_64/hyperv.c | 113 -- tools/testing/selftests/kvm/lib/x86_64/memstress.c | 112 -- tools/testing/selftests/kvm/lib/x86_64/pmu.c | 31 - tools/testing/selftests/kvm/lib/x86_64/processor.c | 1295 ------------------ tools/testing/selftests/kvm/lib/x86_64/sev.c | 141 -- tools/testing/selftests/kvm/lib/x86_64/svm.c | 164 --- tools/testing/selftests/kvm/lib/x86_64/ucall.c | 56 - tools/testing/selftests/kvm/lib/x86_64/vmx.c | 554 -------- tools/testing/selftests/kvm/s390/cmma_test.c | 695 ++++++++++ tools/testing/selftests/kvm/s390/config | 2 + .../selftests/kvm/s390/cpumodel_subfuncs_test.c | 301 +++++ tools/testing/selftests/kvm/s390/debug_test.c | 160 +++ tools/testing/selftests/kvm/s390/memop.c | 1187 +++++++++++++++++ tools/testing/selftests/kvm/s390/resets.c | 313 +++++ .../selftests/kvm/s390/shared_zeropage_test.c | 111 ++ tools/testing/selftests/kvm/s390/sync_regs_test.c | 238 ++++ tools/testing/selftests/kvm/s390/tprot.c | 244 ++++ 
tools/testing/selftests/kvm/s390/ucontrol_test.c | 638 +++++++++ tools/testing/selftests/kvm/s390x/cmma_test.c | 695 ---------- tools/testing/selftests/kvm/s390x/config | 2 - .../selftests/kvm/s390x/cpumodel_subfuncs_test.c | 301 ----- tools/testing/selftests/kvm/s390x/debug_test.c | 160 --- tools/testing/selftests/kvm/s390x/memop.c | 1187 ----------------- tools/testing/selftests/kvm/s390x/resets.c | 313 ----- .../selftests/kvm/s390x/shared_zeropage_test.c | 111 -- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 238 ---- tools/testing/selftests/kvm/s390x/tprot.c | 244 ---- tools/testing/selftests/kvm/s390x/ucontrol_test.c | 638 --------- .../testing/selftests/kvm/set_memory_region_test.c | 6 +- tools/testing/selftests/kvm/x86/amx_test.c | 315 +++++ .../selftests/kvm/x86/apic_bus_clock_test.c | 194 +++ tools/testing/selftests/kvm/x86/cpuid_test.c | 225 ++++ .../selftests/kvm/x86/cr4_cpuid_sync_test.c | 100 ++ tools/testing/selftests/kvm/x86/debug_regs.c | 217 +++ .../kvm/x86/dirty_log_page_splitting_test.c | 263 ++++ .../kvm/x86/exit_on_emulation_failure_test.c | 39 + .../testing/selftests/kvm/x86/feature_msrs_test.c | 113 ++ .../testing/selftests/kvm/x86/fix_hypercall_test.c | 142 ++ tools/testing/selftests/kvm/x86/flds_emulation.h | 52 + tools/testing/selftests/kvm/x86/hwcr_msr_test.c | 45 + tools/testing/selftests/kvm/x86/hyperv_clock.c | 263 ++++ tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 172 +++ tools/testing/selftests/kvm/x86/hyperv_evmcs.c | 307 +++++ .../selftests/kvm/x86/hyperv_extended_hypercalls.c | 98 ++ tools/testing/selftests/kvm/x86/hyperv_features.c | 695 ++++++++++ tools/testing/selftests/kvm/x86/hyperv_ipi.c | 308 +++++ tools/testing/selftests/kvm/x86/hyperv_svm_test.c | 199 +++ tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c | 680 ++++++++++ tools/testing/selftests/kvm/x86/kvm_clock_test.c | 156 +++ tools/testing/selftests/kvm/x86/kvm_pv_test.c | 190 +++ .../selftests/kvm/x86/max_vcpuid_cap_test.c | 62 + .../testing/selftests/kvm/x86/monitor_mwait_test.c | 129 ++ .../selftests/kvm/x86/nested_exceptions_test.c | 288 ++++ .../testing/selftests/kvm/x86/nx_huge_pages_test.c | 266 ++++ .../selftests/kvm/x86/nx_huge_pages_test.sh | 69 + .../testing/selftests/kvm/x86/platform_info_test.c | 78 ++ .../testing/selftests/kvm/x86/pmu_counters_test.c | 644 +++++++++ .../selftests/kvm/x86/pmu_event_filter_test.c | 876 ++++++++++++ .../kvm/x86/private_mem_conversions_test.c | 483 +++++++ .../selftests/kvm/x86/private_mem_kvm_exits_test.c | 120 ++ .../selftests/kvm/x86/recalc_apic_map_test.c | 74 ++ tools/testing/selftests/kvm/x86/set_boot_cpu_id.c | 146 ++ tools/testing/selftests/kvm/x86/set_sregs_test.c | 141 ++ tools/testing/selftests/kvm/x86/sev_init2_tests.c | 152 +++ .../testing/selftests/kvm/x86/sev_migrate_tests.c | 397 ++++++ tools/testing/selftests/kvm/x86/sev_smoke_test.c | 205 +++ .../kvm/x86/smaller_maxphyaddr_emulation_test.c | 105 ++ tools/testing/selftests/kvm/x86/smm_test.c | 209 +++ tools/testing/selftests/kvm/x86/state_test.c | 323 +++++ tools/testing/selftests/kvm/x86/svm_int_ctl_test.c | 118 ++ .../selftests/kvm/x86/svm_nested_shutdown_test.c | 59 + .../kvm/x86/svm_nested_soft_inject_test.c | 210 +++ tools/testing/selftests/kvm/x86/svm_vmcall_test.c | 70 + tools/testing/selftests/kvm/x86/sync_regs_test.c | 411 ++++++ .../selftests/kvm/x86/triple_fault_event_test.c | 124 ++ tools/testing/selftests/kvm/x86/tsc_msrs_test.c | 161 +++ tools/testing/selftests/kvm/x86/tsc_scaling_sync.c | 110 ++ .../selftests/kvm/x86/ucna_injection_test.c | 295 +++++ 
.../testing/selftests/kvm/x86/userspace_io_test.c | 103 ++ .../selftests/kvm/x86/userspace_msr_exit_test.c | 769 +++++++++++ .../selftests/kvm/x86/vmx_apic_access_test.c | 124 ++ .../kvm/x86/vmx_close_while_nested_test.c | 80 ++ .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 179 +++ .../x86/vmx_exception_with_invalid_guest_state.c | 142 ++ .../kvm/x86/vmx_invalid_nested_guest_state.c | 103 ++ tools/testing/selftests/kvm/x86/vmx_msrs_test.c | 131 ++ .../kvm/x86/vmx_nested_tsc_scaling_test.c | 206 +++ .../testing/selftests/kvm/x86/vmx_pmu_caps_test.c | 247 ++++ .../selftests/kvm/x86/vmx_preemption_timer_test.c | 245 ++++ .../selftests/kvm/x86/vmx_set_nested_state_test.c | 304 +++++ .../selftests/kvm/x86/vmx_tsc_adjust_test.c | 156 +++ tools/testing/selftests/kvm/x86/xapic_ipi_test.c | 487 +++++++ tools/testing/selftests/kvm/x86/xapic_state_test.c | 262 ++++ tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c | 139 ++ tools/testing/selftests/kvm/x86/xen_shinfo_test.c | 1161 ++++++++++++++++ tools/testing/selftests/kvm/x86/xen_vmcall_test.c | 143 ++ tools/testing/selftests/kvm/x86/xss_msr_test.c | 54 + tools/testing/selftests/kvm/x86_64/amx_test.c | 315 ----- .../selftests/kvm/x86_64/apic_bus_clock_test.c | 194 --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 225 ---- .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 100 -- tools/testing/selftests/kvm/x86_64/debug_regs.c | 217 --- .../kvm/x86_64/dirty_log_page_splitting_test.c | 263 ---- .../kvm/x86_64/exit_on_emulation_failure_test.c | 39 - .../selftests/kvm/x86_64/feature_msrs_test.c | 113 -- .../selftests/kvm/x86_64/fix_hypercall_test.c | 142 -- .../testing/selftests/kvm/x86_64/flds_emulation.h | 52 - tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c | 45 - tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 263 ---- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 172 --- tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 307 ----- .../kvm/x86_64/hyperv_extended_hypercalls.c | 98 -- .../testing/selftests/kvm/x86_64/hyperv_features.c | 695 ---------- tools/testing/selftests/kvm/x86_64/hyperv_ipi.c | 308 ----- .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 199 --- .../selftests/kvm/x86_64/hyperv_tlb_flush.c | 680 ---------- .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 156 --- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 190 --- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 62 - .../selftests/kvm/x86_64/monitor_mwait_test.c | 129 -- .../selftests/kvm/x86_64/nested_exceptions_test.c | 288 ---- .../selftests/kvm/x86_64/nx_huge_pages_test.c | 266 ---- .../selftests/kvm/x86_64/nx_huge_pages_test.sh | 69 - .../selftests/kvm/x86_64/platform_info_test.c | 78 -- .../selftests/kvm/x86_64/pmu_counters_test.c | 644 --------- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 876 ------------ .../kvm/x86_64/private_mem_conversions_test.c | 483 ------- .../kvm/x86_64/private_mem_kvm_exits_test.c | 120 -- .../selftests/kvm/x86_64/recalc_apic_map_test.c | 74 -- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 146 -- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 141 -- .../testing/selftests/kvm/x86_64/sev_init2_tests.c | 152 --- .../selftests/kvm/x86_64/sev_migrate_tests.c | 397 ------ .../testing/selftests/kvm/x86_64/sev_smoke_test.c | 205 --- .../kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 105 -- tools/testing/selftests/kvm/x86_64/smm_test.c | 209 --- tools/testing/selftests/kvm/x86_64/state_test.c | 323 ----- .../selftests/kvm/x86_64/svm_int_ctl_test.c | 118 -- .../kvm/x86_64/svm_nested_shutdown_test.c | 
59 - .../kvm/x86_64/svm_nested_soft_inject_test.c | 210 --- .../testing/selftests/kvm/x86_64/svm_vmcall_test.c | 70 - .../testing/selftests/kvm/x86_64/sync_regs_test.c | 411 ------ .../selftests/kvm/x86_64/triple_fault_event_test.c | 124 -- tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 161 --- .../selftests/kvm/x86_64/tsc_scaling_sync.c | 110 -- .../selftests/kvm/x86_64/ucna_injection_test.c | 295 ----- .../selftests/kvm/x86_64/userspace_io_test.c | 103 -- .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 769 ----------- .../selftests/kvm/x86_64/vmx_apic_access_test.c | 124 -- .../kvm/x86_64/vmx_close_while_nested_test.c | 80 -- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 179 --- .../vmx_exception_with_invalid_guest_state.c | 142 -- .../kvm/x86_64/vmx_invalid_nested_guest_state.c | 103 -- tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c | 131 -- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 206 --- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 247 ---- .../kvm/x86_64/vmx_preemption_timer_test.c | 245 ---- .../kvm/x86_64/vmx_set_nested_state_test.c | 304 ----- .../selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 156 --- .../testing/selftests/kvm/x86_64/xapic_ipi_test.c | 487 ------- .../selftests/kvm/x86_64/xapic_state_test.c | 262 ---- .../testing/selftests/kvm/x86_64/xcr0_cpuid_test.c | 139 -- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 1161 ---------------- .../testing/selftests/kvm/x86_64/xen_vmcall_test.c | 143 -- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 54 - 298 files changed, 39659 insertions(+), 39695 deletions(-) delete mode 100644 tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c delete mode 100644 tools/testing/selftests/kvm/aarch64/arch_timer.c delete mode 100644 tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c delete mode 100644 tools/testing/selftests/kvm/aarch64/debug-exceptions.c delete mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list.c delete mode 100644 tools/testing/selftests/kvm/aarch64/hypercalls.c delete mode 100644 tools/testing/selftests/kvm/aarch64/mmio_abort.c delete mode 100644 tools/testing/selftests/kvm/aarch64/no-vgic-v3.c delete mode 100644 tools/testing/selftests/kvm/aarch64/page_fault_test.c delete mode 100644 tools/testing/selftests/kvm/aarch64/psci_test.c delete mode 100644 tools/testing/selftests/kvm/aarch64/set_id_regs.c delete mode 100644 tools/testing/selftests/kvm/aarch64/smccc_filter.c delete mode 100644 tools/testing/selftests/kvm/aarch64/vcpu_width_config.c delete mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c delete mode 100644 tools/testing/selftests/kvm/aarch64/vgic_irq.c delete mode 100644 tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c delete mode 100644 tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c create mode 100644 tools/testing/selftests/kvm/arm64/aarch32_id_regs.c create mode 100644 tools/testing/selftests/kvm/arm64/arch_timer.c create mode 100644 tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c create mode 100644 tools/testing/selftests/kvm/arm64/debug-exceptions.c create mode 100644 tools/testing/selftests/kvm/arm64/get-reg-list.c create mode 100644 tools/testing/selftests/kvm/arm64/hypercalls.c create mode 100644 tools/testing/selftests/kvm/arm64/mmio_abort.c create mode 100644 tools/testing/selftests/kvm/arm64/no-vgic-v3.c create mode 100644 tools/testing/selftests/kvm/arm64/page_fault_test.c create mode 100644 tools/testing/selftests/kvm/arm64/psci_test.c create mode 100644 tools/testing/selftests/kvm/arm64/set_id_regs.c create mode 
100644 tools/testing/selftests/kvm/arm64/smccc_filter.c create mode 100644 tools/testing/selftests/kvm/arm64/vcpu_width_config.c create mode 100644 tools/testing/selftests/kvm/arm64/vgic_init.c create mode 100644 tools/testing/selftests/kvm/arm64/vgic_irq.c create mode 100644 tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c create mode 100644 tools/testing/selftests/kvm/arm64/vpmu_counter_access.c delete mode 100644 tools/testing/selftests/kvm/include/aarch64/arch_timer.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/delay.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/gic.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/gic_v3.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/processor.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/spinlock.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/ucall.h delete mode 100644 tools/testing/selftests/kvm/include/aarch64/vgic.h create mode 100644 tools/testing/selftests/kvm/include/arm64/arch_timer.h create mode 100644 tools/testing/selftests/kvm/include/arm64/delay.h create mode 100644 tools/testing/selftests/kvm/include/arm64/gic.h create mode 100644 tools/testing/selftests/kvm/include/arm64/gic_v3.h create mode 100644 tools/testing/selftests/kvm/include/arm64/gic_v3_its.h create mode 100644 tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h create mode 100644 tools/testing/selftests/kvm/include/arm64/processor.h create mode 100644 tools/testing/selftests/kvm/include/arm64/spinlock.h create mode 100644 tools/testing/selftests/kvm/include/arm64/ucall.h create mode 100644 tools/testing/selftests/kvm/include/arm64/vgic.h create mode 100644 tools/testing/selftests/kvm/include/s390/debug_print.h create mode 100644 tools/testing/selftests/kvm/include/s390/diag318_test_handler.h create mode 100644 tools/testing/selftests/kvm/include/s390/facility.h create mode 100644 tools/testing/selftests/kvm/include/s390/kvm_util_arch.h create mode 100644 tools/testing/selftests/kvm/include/s390/processor.h create mode 100644 tools/testing/selftests/kvm/include/s390/sie.h create mode 100644 tools/testing/selftests/kvm/include/s390/ucall.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/debug_print.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/facility.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/processor.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/sie.h delete mode 100644 tools/testing/selftests/kvm/include/s390x/ucall.h create mode 100644 tools/testing/selftests/kvm/include/x86/apic.h create mode 100644 tools/testing/selftests/kvm/include/x86/evmcs.h create mode 100644 tools/testing/selftests/kvm/include/x86/hyperv.h create mode 100644 tools/testing/selftests/kvm/include/x86/kvm_util_arch.h create mode 100644 tools/testing/selftests/kvm/include/x86/mce.h create mode 100644 tools/testing/selftests/kvm/include/x86/pmu.h create mode 100644 tools/testing/selftests/kvm/include/x86/processor.h create mode 100644 tools/testing/selftests/kvm/include/x86/sev.h create mode 100644 tools/testing/selftests/kvm/include/x86/svm.h create mode 100644 tools/testing/selftests/kvm/include/x86/svm_util.h 
create mode 100644 tools/testing/selftests/kvm/include/x86/ucall.h create mode 100644 tools/testing/selftests/kvm/include/x86/vmx.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/apic.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/evmcs.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/hyperv.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/mce.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/pmu.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/processor.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/sev.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/svm.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/svm_util.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/ucall.h delete mode 100644 tools/testing/selftests/kvm/include/x86_64/vmx.h delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_private.h delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_v3.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/handlers.S delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/processor.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/spinlock.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/ucall.c delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/vgic.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/gic.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/gic_private.h create mode 100644 tools/testing/selftests/kvm/lib/arm64/gic_v3.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/handlers.S create mode 100644 tools/testing/selftests/kvm/lib/arm64/processor.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/spinlock.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/arm64/vgic.c create mode 100644 tools/testing/selftests/kvm/lib/s390/diag318_test_handler.c create mode 100644 tools/testing/selftests/kvm/lib/s390/facility.c create mode 100644 tools/testing/selftests/kvm/lib/s390/processor.c create mode 100644 tools/testing/selftests/kvm/lib/s390/ucall.c delete mode 100644 tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c delete mode 100644 tools/testing/selftests/kvm/lib/s390x/facility.c delete mode 100644 tools/testing/selftests/kvm/lib/s390x/processor.c delete mode 100644 tools/testing/selftests/kvm/lib/s390x/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/x86/apic.c create mode 100644 tools/testing/selftests/kvm/lib/x86/handlers.S create mode 100644 tools/testing/selftests/kvm/lib/x86/hyperv.c create mode 100644 tools/testing/selftests/kvm/lib/x86/memstress.c create mode 100644 tools/testing/selftests/kvm/lib/x86/pmu.c create mode 100644 tools/testing/selftests/kvm/lib/x86/processor.c create mode 100644 tools/testing/selftests/kvm/lib/x86/sev.c create mode 100644 tools/testing/selftests/kvm/lib/x86/svm.c create mode 100644 tools/testing/selftests/kvm/lib/x86/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/x86/vmx.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/apic.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/handlers.S delete mode 100644 
tools/testing/selftests/kvm/lib/x86_64/hyperv.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/memstress.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/pmu.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/processor.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/sev.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/svm.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/ucall.c delete mode 100644 tools/testing/selftests/kvm/lib/x86_64/vmx.c create mode 100644 tools/testing/selftests/kvm/s390/cmma_test.c create mode 100644 tools/testing/selftests/kvm/s390/config create mode 100644 tools/testing/selftests/kvm/s390/cpumodel_subfuncs_test.c create mode 100644 tools/testing/selftests/kvm/s390/debug_test.c create mode 100644 tools/testing/selftests/kvm/s390/memop.c create mode 100644 tools/testing/selftests/kvm/s390/resets.c create mode 100644 tools/testing/selftests/kvm/s390/shared_zeropage_test.c create mode 100644 tools/testing/selftests/kvm/s390/sync_regs_test.c create mode 100644 tools/testing/selftests/kvm/s390/tprot.c create mode 100644 tools/testing/selftests/kvm/s390/ucontrol_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/cmma_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/config delete mode 100644 tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/debug_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/memop.c delete mode 100644 tools/testing/selftests/kvm/s390x/resets.c delete mode 100644 tools/testing/selftests/kvm/s390x/shared_zeropage_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/sync_regs_test.c delete mode 100644 tools/testing/selftests/kvm/s390x/tprot.c delete mode 100644 tools/testing/selftests/kvm/s390x/ucontrol_test.c create mode 100644 tools/testing/selftests/kvm/x86/amx_test.c create mode 100644 tools/testing/selftests/kvm/x86/apic_bus_clock_test.c create mode 100644 tools/testing/selftests/kvm/x86/cpuid_test.c create mode 100644 tools/testing/selftests/kvm/x86/cr4_cpuid_sync_test.c create mode 100644 tools/testing/selftests/kvm/x86/debug_regs.c create mode 100644 tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c create mode 100644 tools/testing/selftests/kvm/x86/exit_on_emulation_failure_test.c create mode 100644 tools/testing/selftests/kvm/x86/feature_msrs_test.c create mode 100644 tools/testing/selftests/kvm/x86/fix_hypercall_test.c create mode 100644 tools/testing/selftests/kvm/x86/flds_emulation.h create mode 100644 tools/testing/selftests/kvm/x86/hwcr_msr_test.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_clock.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_cpuid.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_evmcs.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_extended_hypercalls.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_features.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_ipi.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_svm_test.c create mode 100644 tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c create mode 100644 tools/testing/selftests/kvm/x86/kvm_clock_test.c create mode 100644 tools/testing/selftests/kvm/x86/kvm_pv_test.c create mode 100644 tools/testing/selftests/kvm/x86/max_vcpuid_cap_test.c create mode 100644 tools/testing/selftests/kvm/x86/monitor_mwait_test.c create mode 100644 tools/testing/selftests/kvm/x86/nested_exceptions_test.c create mode 100644 
tools/testing/selftests/kvm/x86/nx_huge_pages_test.c create mode 100755 tools/testing/selftests/kvm/x86/nx_huge_pages_test.sh create mode 100644 tools/testing/selftests/kvm/x86/platform_info_test.c create mode 100644 tools/testing/selftests/kvm/x86/pmu_counters_test.c create mode 100644 tools/testing/selftests/kvm/x86/pmu_event_filter_test.c create mode 100644 tools/testing/selftests/kvm/x86/private_mem_conversions_test.c create mode 100644 tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c create mode 100644 tools/testing/selftests/kvm/x86/recalc_apic_map_test.c create mode 100644 tools/testing/selftests/kvm/x86/set_boot_cpu_id.c create mode 100644 tools/testing/selftests/kvm/x86/set_sregs_test.c create mode 100644 tools/testing/selftests/kvm/x86/sev_init2_tests.c create mode 100644 tools/testing/selftests/kvm/x86/sev_migrate_tests.c create mode 100644 tools/testing/selftests/kvm/x86/sev_smoke_test.c create mode 100644 tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c create mode 100644 tools/testing/selftests/kvm/x86/smm_test.c create mode 100644 tools/testing/selftests/kvm/x86/state_test.c create mode 100644 tools/testing/selftests/kvm/x86/svm_int_ctl_test.c create mode 100644 tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c create mode 100644 tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c create mode 100644 tools/testing/selftests/kvm/x86/svm_vmcall_test.c create mode 100644 tools/testing/selftests/kvm/x86/sync_regs_test.c create mode 100644 tools/testing/selftests/kvm/x86/triple_fault_event_test.c create mode 100644 tools/testing/selftests/kvm/x86/tsc_msrs_test.c create mode 100644 tools/testing/selftests/kvm/x86/tsc_scaling_sync.c create mode 100644 tools/testing/selftests/kvm/x86/ucna_injection_test.c create mode 100644 tools/testing/selftests/kvm/x86/userspace_io_test.c create mode 100644 tools/testing/selftests/kvm/x86/userspace_msr_exit_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_apic_access_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_exception_with_invalid_guest_state.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_msrs_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_pmu_caps_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c create mode 100644 tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c create mode 100644 tools/testing/selftests/kvm/x86/xapic_ipi_test.c create mode 100644 tools/testing/selftests/kvm/x86/xapic_state_test.c create mode 100644 tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c create mode 100644 tools/testing/selftests/kvm/x86/xen_shinfo_test.c create mode 100644 tools/testing/selftests/kvm/x86/xen_vmcall_test.c create mode 100644 tools/testing/selftests/kvm/x86/xss_msr_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/amx_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/cpuid_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/debug_regs.c 
delete mode 100644 tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/feature_msrs_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/flds_emulation.h delete mode 100644 tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_clock.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_features.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_ipi.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c delete mode 100644 tools/testing/selftests/kvm/x86_64/kvm_clock_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/kvm_pv_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c delete mode 100755 tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh delete mode 100644 tools/testing/selftests/kvm/x86_64/platform_info_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c delete mode 100644 tools/testing/selftests/kvm/x86_64/set_sregs_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/sev_init2_tests.c delete mode 100644 tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c delete mode 100644 tools/testing/selftests/kvm/x86_64/sev_smoke_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/smm_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/state_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/sync_regs_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c delete mode 100644 tools/testing/selftests/kvm/x86_64/ucna_injection_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/userspace_io_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c delete mode 100644 
tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xapic_state_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/xss_msr_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7b33464bf8cc..9bc2eba1af1c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -4,16 +4,12 @@ include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64)) -ifeq ($(ARCH),x86) - ARCH_DIR := x86_64 -else ifeq ($(ARCH),arm64) - ARCH_DIR := aarch64 -else ifeq ($(ARCH),s390) - ARCH_DIR := s390x +# Top-level selftests allows ARCH=x86_64 :-( +ifeq ($(ARCH),x86_64) + ARCH_DIR := x86 else ARCH_DIR := $(ARCH) endif - include Makefile.kvm else # Empty targets for unsupported architectures diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index e988a72f8c20..9888dd6bb483 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -18,177 +18,177 @@ LIBKVM += lib/userfaultfd_util.c LIBKVM_STRING += lib/string_override.c -LIBKVM_x86_64 += lib/x86_64/apic.c -LIBKVM_x86_64 += lib/x86_64/handlers.S -LIBKVM_x86_64 += lib/x86_64/hyperv.c -LIBKVM_x86_64 += lib/x86_64/memstress.c -LIBKVM_x86_64 += lib/x86_64/pmu.c -LIBKVM_x86_64 += lib/x86_64/processor.c -LIBKVM_x86_64 += lib/x86_64/sev.c -LIBKVM_x86_64 += lib/x86_64/svm.c -LIBKVM_x86_64 += lib/x86_64/ucall.c -LIBKVM_x86_64 += lib/x86_64/vmx.c - -LIBKVM_aarch64 += lib/aarch64/gic.c -LIBKVM_aarch64 += lib/aarch64/gic_v3.c -LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c -LIBKVM_aarch64 += lib/aarch64/handlers.S -LIBKVM_aarch64 += lib/aarch64/processor.c -LIBKVM_aarch64 += lib/aarch64/spinlock.c -LIBKVM_aarch64 += lib/aarch64/ucall.c -LIBKVM_aarch64 += lib/aarch64/vgic.c - -LIBKVM_s390x += lib/s390x/diag318_test_handler.c -LIBKVM_s390x += lib/s390x/processor.c -LIBKVM_s390x += lib/s390x/ucall.c -LIBKVM_s390x += lib/s390x/facility.c +LIBKVM_x86 += lib/x86/apic.c +LIBKVM_x86 += lib/x86/handlers.S +LIBKVM_x86 += lib/x86/hyperv.c +LIBKVM_x86 += lib/x86/memstress.c +LIBKVM_x86 += lib/x86/pmu.c +LIBKVM_x86 += lib/x86/processor.c +LIBKVM_x86 += lib/x86/sev.c +LIBKVM_x86 += lib/x86/svm.c +LIBKVM_x86 += lib/x86/ucall.c +LIBKVM_x86 += lib/x86/vmx.c + +LIBKVM_arm64 += lib/arm64/gic.c +LIBKVM_arm64 += lib/arm64/gic_v3.c +LIBKVM_arm64 += lib/arm64/gic_v3_its.c +LIBKVM_arm64 
+= lib/arm64/handlers.S +LIBKVM_arm64 += lib/arm64/processor.c +LIBKVM_arm64 += lib/arm64/spinlock.c +LIBKVM_arm64 += lib/arm64/ucall.c +LIBKVM_arm64 += lib/arm64/vgic.c + +LIBKVM_s390 += lib/s390/diag318_test_handler.c +LIBKVM_s390 += lib/s390/processor.c +LIBKVM_s390 += lib/s390/ucall.c +LIBKVM_s390 += lib/s390/facility.c LIBKVM_riscv += lib/riscv/handlers.S LIBKVM_riscv += lib/riscv/processor.c LIBKVM_riscv += lib/riscv/ucall.c # Non-compiled test targets -TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh +TEST_PROGS_x86 += x86/nx_huge_pages_test.sh # Compiled test targets -TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test -TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test -TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test -TEST_GEN_PROGS_x86_64 += x86_64/feature_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test -TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test -TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_extended_hypercalls -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test -TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush -TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test -TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test -TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test -TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test -TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test -TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test -TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test -TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test -TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test -TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id -TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test -TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test -TEST_GEN_PROGS_x86_64 += x86_64/smm_test -TEST_GEN_PROGS_x86_64 += x86_64/state_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_shutdown_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test -TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync -TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test -TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test -TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test -TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state -TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state -TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test -TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test -TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test -TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test -TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test -TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test -TEST_GEN_PROGS_x86_64 += x86_64/debug_regs -TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test -TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test -TEST_GEN_PROGS_x86_64 += 
x86_64/xen_shinfo_test -TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test -TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests -TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests -TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test -TEST_GEN_PROGS_x86_64 += x86_64/amx_test -TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test -TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test -TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test -TEST_GEN_PROGS_x86_64 += access_tracking_perf_test -TEST_GEN_PROGS_x86_64 += coalesced_io_test -TEST_GEN_PROGS_x86_64 += demand_paging_test -TEST_GEN_PROGS_x86_64 += dirty_log_test -TEST_GEN_PROGS_x86_64 += dirty_log_perf_test -TEST_GEN_PROGS_x86_64 += guest_memfd_test -TEST_GEN_PROGS_x86_64 += guest_print_test -TEST_GEN_PROGS_x86_64 += hardware_disable_test -TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus -TEST_GEN_PROGS_x86_64 += kvm_page_table_test -TEST_GEN_PROGS_x86_64 += mmu_stress_test -TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test -TEST_GEN_PROGS_x86_64 += memslot_perf_test -TEST_GEN_PROGS_x86_64 += rseq_test -TEST_GEN_PROGS_x86_64 += set_memory_region_test -TEST_GEN_PROGS_x86_64 += steal_time -TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test -TEST_GEN_PROGS_x86_64 += system_counter_offset_test -TEST_GEN_PROGS_x86_64 += pre_fault_memory_test +TEST_GEN_PROGS_x86 = x86/cpuid_test +TEST_GEN_PROGS_x86 += x86/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86 += x86/dirty_log_page_splitting_test +TEST_GEN_PROGS_x86 += x86/feature_msrs_test +TEST_GEN_PROGS_x86 += x86/exit_on_emulation_failure_test +TEST_GEN_PROGS_x86 += x86/fix_hypercall_test +TEST_GEN_PROGS_x86 += x86/hwcr_msr_test +TEST_GEN_PROGS_x86 += x86/hyperv_clock +TEST_GEN_PROGS_x86 += x86/hyperv_cpuid +TEST_GEN_PROGS_x86 += x86/hyperv_evmcs +TEST_GEN_PROGS_x86 += x86/hyperv_extended_hypercalls +TEST_GEN_PROGS_x86 += x86/hyperv_features +TEST_GEN_PROGS_x86 += x86/hyperv_ipi +TEST_GEN_PROGS_x86 += x86/hyperv_svm_test +TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush +TEST_GEN_PROGS_x86 += x86/kvm_clock_test +TEST_GEN_PROGS_x86 += x86/kvm_pv_test +TEST_GEN_PROGS_x86 += x86/monitor_mwait_test +TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/platform_info_test +TEST_GEN_PROGS_x86 += x86/pmu_counters_test +TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test +TEST_GEN_PROGS_x86 += x86/private_mem_conversions_test +TEST_GEN_PROGS_x86 += x86/private_mem_kvm_exits_test +TEST_GEN_PROGS_x86 += x86/set_boot_cpu_id +TEST_GEN_PROGS_x86 += x86/set_sregs_test +TEST_GEN_PROGS_x86 += x86/smaller_maxphyaddr_emulation_test +TEST_GEN_PROGS_x86 += x86/smm_test +TEST_GEN_PROGS_x86 += x86/state_test +TEST_GEN_PROGS_x86 += x86/vmx_preemption_timer_test +TEST_GEN_PROGS_x86 += x86/svm_vmcall_test +TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test +TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test +TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test +TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync +TEST_GEN_PROGS_x86 += x86/sync_regs_test +TEST_GEN_PROGS_x86 += x86/ucna_injection_test +TEST_GEN_PROGS_x86 += x86/userspace_io_test +TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test +TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test +TEST_GEN_PROGS_x86 += x86/vmx_close_while_nested_test +TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test +TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_msrs_test +TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test +TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86 += 
x86/vmx_nested_tsc_scaling_test +TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test +TEST_GEN_PROGS_x86 += x86/xapic_ipi_test +TEST_GEN_PROGS_x86 += x86/xapic_state_test +TEST_GEN_PROGS_x86 += x86/xcr0_cpuid_test +TEST_GEN_PROGS_x86 += x86/xss_msr_test +TEST_GEN_PROGS_x86 += x86/debug_regs +TEST_GEN_PROGS_x86 += x86/tsc_msrs_test +TEST_GEN_PROGS_x86 += x86/vmx_pmu_caps_test +TEST_GEN_PROGS_x86 += x86/xen_shinfo_test +TEST_GEN_PROGS_x86 += x86/xen_vmcall_test +TEST_GEN_PROGS_x86 += x86/sev_init2_tests +TEST_GEN_PROGS_x86 += x86/sev_migrate_tests +TEST_GEN_PROGS_x86 += x86/sev_smoke_test +TEST_GEN_PROGS_x86 += x86/amx_test +TEST_GEN_PROGS_x86 += x86/max_vcpuid_cap_test +TEST_GEN_PROGS_x86 += x86/triple_fault_event_test +TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test +TEST_GEN_PROGS_x86 += access_tracking_perf_test +TEST_GEN_PROGS_x86 += coalesced_io_test +TEST_GEN_PROGS_x86 += demand_paging_test +TEST_GEN_PROGS_x86 += dirty_log_test +TEST_GEN_PROGS_x86 += dirty_log_perf_test +TEST_GEN_PROGS_x86 += guest_memfd_test +TEST_GEN_PROGS_x86 += guest_print_test +TEST_GEN_PROGS_x86 += hardware_disable_test +TEST_GEN_PROGS_x86 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86 += kvm_page_table_test +TEST_GEN_PROGS_x86 += memslot_modification_stress_test +TEST_GEN_PROGS_x86 += memslot_perf_test +TEST_GEN_PROGS_x86 += mmu_stress_test +TEST_GEN_PROGS_x86 += rseq_test +TEST_GEN_PROGS_x86 += set_memory_region_test +TEST_GEN_PROGS_x86 += steal_time +TEST_GEN_PROGS_x86 += kvm_binary_stats_test +TEST_GEN_PROGS_x86 += system_counter_offset_test +TEST_GEN_PROGS_x86 += pre_fault_memory_test # Compiled outputs used by test targets -TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test - -TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs -TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases -TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions -TEST_GEN_PROGS_aarch64 += aarch64/hypercalls -TEST_GEN_PROGS_aarch64 += aarch64/mmio_abort -TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test -TEST_GEN_PROGS_aarch64 += aarch64/psci_test -TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs -TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter -TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config -TEST_GEN_PROGS_aarch64 += aarch64/vgic_init -TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq -TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress -TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access -TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 -TEST_GEN_PROGS_aarch64 += access_tracking_perf_test -TEST_GEN_PROGS_aarch64 += arch_timer -TEST_GEN_PROGS_aarch64 += coalesced_io_test -TEST_GEN_PROGS_aarch64 += demand_paging_test -TEST_GEN_PROGS_aarch64 += dirty_log_test -TEST_GEN_PROGS_aarch64 += dirty_log_perf_test -TEST_GEN_PROGS_aarch64 += guest_print_test -TEST_GEN_PROGS_aarch64 += get-reg-list -TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus -TEST_GEN_PROGS_aarch64 += kvm_page_table_test -TEST_GEN_PROGS_aarch64 += memslot_modification_stress_test -TEST_GEN_PROGS_aarch64 += memslot_perf_test -TEST_GEN_PROGS_aarch64 += mmu_stress_test -TEST_GEN_PROGS_aarch64 += rseq_test -TEST_GEN_PROGS_aarch64 += set_memory_region_test -TEST_GEN_PROGS_aarch64 += steal_time -TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test - -TEST_GEN_PROGS_s390x = s390x/memop -TEST_GEN_PROGS_s390x += s390x/resets -TEST_GEN_PROGS_s390x += s390x/sync_regs_test -TEST_GEN_PROGS_s390x += s390x/tprot -TEST_GEN_PROGS_s390x += s390x/cmma_test -TEST_GEN_PROGS_s390x += s390x/debug_test -TEST_GEN_PROGS_s390x += s390x/cpumodel_subfuncs_test -TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test 
-TEST_GEN_PROGS_s390x += s390x/ucontrol_test -TEST_GEN_PROGS_s390x += demand_paging_test -TEST_GEN_PROGS_s390x += dirty_log_test -TEST_GEN_PROGS_s390x += guest_print_test -TEST_GEN_PROGS_s390x += kvm_create_max_vcpus -TEST_GEN_PROGS_s390x += kvm_page_table_test -TEST_GEN_PROGS_s390x += rseq_test -TEST_GEN_PROGS_s390x += set_memory_region_test -TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test + +TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs +TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/debug-exceptions +TEST_GEN_PROGS_arm64 += arm64/hypercalls +TEST_GEN_PROGS_arm64 += arm64/mmio_abort +TEST_GEN_PROGS_arm64 += arm64/page_fault_test +TEST_GEN_PROGS_arm64 += arm64/psci_test +TEST_GEN_PROGS_arm64 += arm64/set_id_regs +TEST_GEN_PROGS_arm64 += arm64/smccc_filter +TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config +TEST_GEN_PROGS_arm64 += arm64/vgic_init +TEST_GEN_PROGS_arm64 += arm64/vgic_irq +TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress +TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access +TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += access_tracking_perf_test +TEST_GEN_PROGS_arm64 += arch_timer +TEST_GEN_PROGS_arm64 += coalesced_io_test +TEST_GEN_PROGS_arm64 += demand_paging_test +TEST_GEN_PROGS_arm64 += dirty_log_test +TEST_GEN_PROGS_arm64 += dirty_log_perf_test +TEST_GEN_PROGS_arm64 += guest_print_test +TEST_GEN_PROGS_arm64 += get-reg-list +TEST_GEN_PROGS_arm64 += kvm_create_max_vcpus +TEST_GEN_PROGS_arm64 += kvm_page_table_test +TEST_GEN_PROGS_arm64 += memslot_modification_stress_test +TEST_GEN_PROGS_arm64 += memslot_perf_test +TEST_GEN_PROGS_arm64 += mmu_stress_test +TEST_GEN_PROGS_arm64 += rseq_test +TEST_GEN_PROGS_arm64 += set_memory_region_test +TEST_GEN_PROGS_arm64 += steal_time +TEST_GEN_PROGS_arm64 += kvm_binary_stats_test + +TEST_GEN_PROGS_s390 = s390/memop +TEST_GEN_PROGS_s390 += s390/resets +TEST_GEN_PROGS_s390 += s390/sync_regs_test +TEST_GEN_PROGS_s390 += s390/tprot +TEST_GEN_PROGS_s390 += s390/cmma_test +TEST_GEN_PROGS_s390 += s390/debug_test +TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test +TEST_GEN_PROGS_s390 += s390/shared_zeropage_test +TEST_GEN_PROGS_s390 += s390/ucontrol_test +TEST_GEN_PROGS_s390 += demand_paging_test +TEST_GEN_PROGS_s390 += dirty_log_test +TEST_GEN_PROGS_s390 += guest_print_test +TEST_GEN_PROGS_s390 += kvm_create_max_vcpus +TEST_GEN_PROGS_s390 += kvm_page_table_test +TEST_GEN_PROGS_s390 += rseq_test +TEST_GEN_PROGS_s390 += set_memory_region_test +TEST_GEN_PROGS_s390 += kvm_binary_stats_test TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test TEST_GEN_PROGS_riscv += riscv/ebreak_test @@ -222,11 +222,7 @@ include ../lib.mk INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include -ifeq ($(ARCH),x86_64) -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include -else -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include -endif +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH_DIR)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ -fno-builtin-memcmp -fno-builtin-memcpy \ diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c deleted file mode 100644 index 447d61cae4db..000000000000 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ /dev/null @@ -1,167 +0,0 @@ -// SPDX-License-Identifier: 
GPL-2.0-only -/* - * aarch32_id_regs - Test for ID register behavior on AArch64-only systems - * - * Copyright (c) 2022 Google LLC. - * - * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ - * and WI from userspace. - */ - -#include - -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" -#include - -#define BAD_ID_REG_VAL 0x1badc0deul - -#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) - -static void guest_main(void) -{ - GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); - GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); - GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); - GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); - GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); - GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); - GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); - GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); - - GUEST_DONE(); -} - -static void test_guest_raz(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } -} - -static uint64_t raz_wi_reg_ids[] = { - KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), - KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), - KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), - KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), - KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), - KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), - KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), - KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), - KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), -}; - -static void test_user_raz_wi(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(raz_wi_reg_ids); i++) { - uint64_t reg_id = raz_wi_reg_ids[i]; - uint64_t val; - - val = vcpu_get_reg(vcpu, reg_id); - TEST_ASSERT_EQ(val, 0); - - /* - * Expect the ioctl to succeed with no effect on the register - * value. 
- */ - vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); - - val = vcpu_get_reg(vcpu, reg_id); - TEST_ASSERT_EQ(val, 0); - } -} - -static uint64_t raz_invariant_reg_ids[] = { - KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), - KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), - KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), - KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), -}; - -static void test_user_raz_invariant(struct kvm_vcpu *vcpu) -{ - int i, r; - - for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { - uint64_t reg_id = raz_invariant_reg_ids[i]; - uint64_t val; - - val = vcpu_get_reg(vcpu, reg_id); - TEST_ASSERT_EQ(val, 0); - - r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); - TEST_ASSERT(r < 0 && errno == EINVAL, - "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); - - val = vcpu_get_reg(vcpu, reg_id); - TEST_ASSERT_EQ(val, 0); - } -} - - - -static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) -{ - uint64_t val, el0; - - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); - return el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY; -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - - TEST_REQUIRE(vcpu_aarch64_only(vcpu)); - - test_user_raz_wi(vcpu); - test_user_raz_invariant(vcpu); - test_guest_raz(vcpu); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c deleted file mode 100644 index eeba1cc87ff8..000000000000 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ /dev/null @@ -1,220 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * The test validates both the virtual and physical timer IRQs using - * CVAL and TVAL registers. - * - * Copyright (c) 2021, Google LLC. 
- */ -#include "arch_timer.h" -#include "delay.h" -#include "gic.h" -#include "processor.h" -#include "timer_test.h" -#include "ucall_common.h" -#include "vgic.h" - -enum guest_stage { - GUEST_STAGE_VTIMER_CVAL = 1, - GUEST_STAGE_VTIMER_TVAL, - GUEST_STAGE_PTIMER_CVAL, - GUEST_STAGE_PTIMER_TVAL, - GUEST_STAGE_MAX, -}; - -static int vtimer_irq, ptimer_irq; - -static void -guest_configure_timer_action(struct test_vcpu_shared_data *shared_data) -{ - switch (shared_data->guest_stage) { - case GUEST_STAGE_VTIMER_CVAL: - timer_set_next_cval_ms(VIRTUAL, test_args.timer_period_ms); - shared_data->xcnt = timer_get_cntct(VIRTUAL); - timer_set_ctl(VIRTUAL, CTL_ENABLE); - break; - case GUEST_STAGE_VTIMER_TVAL: - timer_set_next_tval_ms(VIRTUAL, test_args.timer_period_ms); - shared_data->xcnt = timer_get_cntct(VIRTUAL); - timer_set_ctl(VIRTUAL, CTL_ENABLE); - break; - case GUEST_STAGE_PTIMER_CVAL: - timer_set_next_cval_ms(PHYSICAL, test_args.timer_period_ms); - shared_data->xcnt = timer_get_cntct(PHYSICAL); - timer_set_ctl(PHYSICAL, CTL_ENABLE); - break; - case GUEST_STAGE_PTIMER_TVAL: - timer_set_next_tval_ms(PHYSICAL, test_args.timer_period_ms); - shared_data->xcnt = timer_get_cntct(PHYSICAL); - timer_set_ctl(PHYSICAL, CTL_ENABLE); - break; - default: - GUEST_ASSERT(0); - } -} - -static void guest_validate_irq(unsigned int intid, - struct test_vcpu_shared_data *shared_data) -{ - enum guest_stage stage = shared_data->guest_stage; - uint64_t xcnt = 0, xcnt_diff_us, cval = 0; - unsigned long xctl = 0; - unsigned int timer_irq = 0; - unsigned int accessor; - - if (intid == IAR_SPURIOUS) - return; - - switch (stage) { - case GUEST_STAGE_VTIMER_CVAL: - case GUEST_STAGE_VTIMER_TVAL: - accessor = VIRTUAL; - timer_irq = vtimer_irq; - break; - case GUEST_STAGE_PTIMER_CVAL: - case GUEST_STAGE_PTIMER_TVAL: - accessor = PHYSICAL; - timer_irq = ptimer_irq; - break; - default: - GUEST_ASSERT(0); - return; - } - - xctl = timer_get_ctl(accessor); - if ((xctl & CTL_IMASK) || !(xctl & CTL_ENABLE)) - return; - - timer_set_ctl(accessor, CTL_IMASK); - xcnt = timer_get_cntct(accessor); - cval = timer_get_cval(accessor); - - xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); - - /* Make sure we are dealing with the correct timer IRQ */ - GUEST_ASSERT_EQ(intid, timer_irq); - - /* Basic 'timer condition met' check */ - __GUEST_ASSERT(xcnt >= cval, - "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx", - xcnt, cval, xcnt_diff_us); - __GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl); - - WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); -} - -static void guest_irq_handler(struct ex_regs *regs) -{ - unsigned int intid = gic_get_and_ack_irq(); - uint32_t cpu = guest_get_vcpuid(); - struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; - - guest_validate_irq(intid, shared_data); - - gic_set_eoi(intid); -} - -static void guest_run_stage(struct test_vcpu_shared_data *shared_data, - enum guest_stage stage) -{ - uint32_t irq_iter, config_iter; - - shared_data->guest_stage = stage; - shared_data->nr_iter = 0; - - for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { - /* Setup the next interrupt */ - guest_configure_timer_action(shared_data); - - /* Setup a timeout for the interrupt to arrive */ - udelay(msecs_to_usecs(test_args.timer_period_ms) + - test_args.timer_err_margin_us); - - irq_iter = READ_ONCE(shared_data->nr_iter); - __GUEST_ASSERT(config_iter + 1 == irq_iter, - "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" - " Guest timer interrupt was not triggered within the 
specified\n" - " interval, try to increase the error margin by [-e] option.\n", - config_iter + 1, irq_iter); - } -} - -static void guest_code(void) -{ - uint32_t cpu = guest_get_vcpuid(); - struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; - - local_irq_disable(); - - gic_init(GIC_V3, test_args.nr_vcpus); - - timer_set_ctl(VIRTUAL, CTL_IMASK); - timer_set_ctl(PHYSICAL, CTL_IMASK); - - gic_irq_enable(vtimer_irq); - gic_irq_enable(ptimer_irq); - local_irq_enable(); - - guest_run_stage(shared_data, GUEST_STAGE_VTIMER_CVAL); - guest_run_stage(shared_data, GUEST_STAGE_VTIMER_TVAL); - guest_run_stage(shared_data, GUEST_STAGE_PTIMER_CVAL); - guest_run_stage(shared_data, GUEST_STAGE_PTIMER_TVAL); - - GUEST_DONE(); -} - -static void test_init_timer_irq(struct kvm_vm *vm) -{ - /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ - vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); - vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); - - sync_global_to_guest(vm, ptimer_irq); - sync_global_to_guest(vm, vtimer_irq); - - pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); -} - -static int gic_fd; - -struct kvm_vm *test_vm_create(void) -{ - struct kvm_vm *vm; - unsigned int i; - int nr_vcpus = test_args.nr_vcpus; - - vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - - vm_init_descriptor_tables(vm); - vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); - - if (!test_args.reserved) { - if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) { - struct kvm_arm_counter_offset offset = { - .counter_offset = test_args.counter_offset, - .reserved = 0, - }; - vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &offset); - } else - TEST_FAIL("no support for global offset"); - } - - for (i = 0; i < nr_vcpus; i++) - vcpu_init_descriptor_tables(vcpus[i]); - - test_init_timer_irq(vm); - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); - __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); - - /* Make all the test's cmdline args visible to the guest */ - sync_global_to_guest(vm, test_args); - - return vm; -} - -void test_vm_cleanup(struct kvm_vm *vm) -{ - close(gic_fd); - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c deleted file mode 100644 index a36a7e2db434..000000000000 --- a/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c +++ /dev/null @@ -1,1062 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * arch_timer_edge_cases.c - Tests the aarch64 timer IRQ functionality. - * - * The test validates some edge cases related to the arch-timer: - * - timers above the max TVAL value. - * - timers in the past - * - moving counters ahead and behind pending timers. - * - reprograming timers. - * - timers fired multiple times. - * - masking/unmasking using the timer control mask. - * - * Copyright (c) 2021, Google LLC. - */ - -#define _GNU_SOURCE - -#include -#include - -#include "arch_timer.h" -#include "gic.h" -#include "vgic.h" - -static const uint64_t CVAL_MAX = ~0ULL; -/* tval is a signed 32-bit int. */ -static const int32_t TVAL_MAX = INT32_MAX; -static const int32_t TVAL_MIN = INT32_MIN; - -/* After how much time we say there is no IRQ. */ -static const uint32_t TIMEOUT_NO_IRQ_US = 50000; - -/* A nice counter value to use as the starting one for most tests. */ -static const uint64_t DEF_CNT = (CVAL_MAX / 2); - -/* Number of runs. 
*/ -static const uint32_t NR_TEST_ITERS_DEF = 5; - -/* Default wait test time in ms. */ -static const uint32_t WAIT_TEST_MS = 10; - -/* Default "long" wait test time in ms. */ -static const uint32_t LONG_WAIT_TEST_MS = 100; - -/* Shared with IRQ handler. */ -struct test_vcpu_shared_data { - atomic_t handled; - atomic_t spurious; -} shared_data; - -struct test_args { - /* Virtual or physical timer and counter tests. */ - enum arch_timer timer; - /* Delay used for most timer tests. */ - uint64_t wait_ms; - /* Delay used in the test_long_timer_delays test. */ - uint64_t long_wait_ms; - /* Number of iterations. */ - int iterations; - /* Whether to test the physical timer. */ - bool test_physical; - /* Whether to test the virtual timer. */ - bool test_virtual; -}; - -struct test_args test_args = { - .wait_ms = WAIT_TEST_MS, - .long_wait_ms = LONG_WAIT_TEST_MS, - .iterations = NR_TEST_ITERS_DEF, - .test_physical = true, - .test_virtual = true, -}; - -static int vtimer_irq, ptimer_irq; - -enum sync_cmd { - SET_COUNTER_VALUE, - USERSPACE_USLEEP, - USERSPACE_SCHED_YIELD, - USERSPACE_MIGRATE_SELF, - NO_USERSPACE_CMD, -}; - -typedef void (*sleep_method_t)(enum arch_timer timer, uint64_t usec); - -static void sleep_poll(enum arch_timer timer, uint64_t usec); -static void sleep_sched_poll(enum arch_timer timer, uint64_t usec); -static void sleep_in_userspace(enum arch_timer timer, uint64_t usec); -static void sleep_migrate(enum arch_timer timer, uint64_t usec); - -sleep_method_t sleep_method[] = { - sleep_poll, - sleep_sched_poll, - sleep_migrate, - sleep_in_userspace, -}; - -typedef void (*irq_wait_method_t)(void); - -static void wait_for_non_spurious_irq(void); -static void wait_poll_for_irq(void); -static void wait_sched_poll_for_irq(void); -static void wait_migrate_poll_for_irq(void); - -irq_wait_method_t irq_wait_method[] = { - wait_for_non_spurious_irq, - wait_poll_for_irq, - wait_sched_poll_for_irq, - wait_migrate_poll_for_irq, -}; - -enum timer_view { - TIMER_CVAL, - TIMER_TVAL, -}; - -static void assert_irqs_handled(uint32_t n) -{ - int h = atomic_read(&shared_data.handled); - - __GUEST_ASSERT(h == n, "Handled %d IRQS but expected %d", h, n); -} - -static void userspace_cmd(uint64_t cmd) -{ - GUEST_SYNC_ARGS(cmd, 0, 0, 0, 0); -} - -static void userspace_migrate_vcpu(void) -{ - userspace_cmd(USERSPACE_MIGRATE_SELF); -} - -static void userspace_sleep(uint64_t usecs) -{ - GUEST_SYNC_ARGS(USERSPACE_USLEEP, usecs, 0, 0, 0); -} - -static void set_counter(enum arch_timer timer, uint64_t counter) -{ - GUEST_SYNC_ARGS(SET_COUNTER_VALUE, counter, timer, 0, 0); -} - -static void guest_irq_handler(struct ex_regs *regs) -{ - unsigned int intid = gic_get_and_ack_irq(); - enum arch_timer timer; - uint64_t cnt, cval; - uint32_t ctl; - bool timer_condition, istatus; - - if (intid == IAR_SPURIOUS) { - atomic_inc(&shared_data.spurious); - goto out; - } - - if (intid == ptimer_irq) - timer = PHYSICAL; - else if (intid == vtimer_irq) - timer = VIRTUAL; - else - goto out; - - ctl = timer_get_ctl(timer); - cval = timer_get_cval(timer); - cnt = timer_get_cntct(timer); - timer_condition = cnt >= cval; - istatus = (ctl & CTL_ISTATUS) && (ctl & CTL_ENABLE); - GUEST_ASSERT_EQ(timer_condition, istatus); - - /* Disable and mask the timer. 
*/ - timer_set_ctl(timer, CTL_IMASK); - - atomic_inc(&shared_data.handled); - -out: - gic_set_eoi(intid); -} - -static void set_cval_irq(enum arch_timer timer, uint64_t cval_cycles, - uint32_t ctl) -{ - atomic_set(&shared_data.handled, 0); - atomic_set(&shared_data.spurious, 0); - timer_set_cval(timer, cval_cycles); - timer_set_ctl(timer, ctl); -} - -static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles, - uint32_t ctl) -{ - atomic_set(&shared_data.handled, 0); - atomic_set(&shared_data.spurious, 0); - timer_set_ctl(timer, ctl); - timer_set_tval(timer, tval_cycles); -} - -static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl, - enum timer_view tv) -{ - switch (tv) { - case TIMER_CVAL: - set_cval_irq(timer, xval, ctl); - break; - case TIMER_TVAL: - set_tval_irq(timer, xval, ctl); - break; - default: - GUEST_FAIL("Could not get timer %d", timer); - } -} - -/* - * Note that this can theoretically hang forever, so we rely on having - * a timeout mechanism in the "runner", like: - * tools/testing/selftests/kselftest/runner.sh. - */ -static void wait_for_non_spurious_irq(void) -{ - int h; - - local_irq_disable(); - - for (h = atomic_read(&shared_data.handled); h == atomic_read(&shared_data.handled);) { - wfi(); - local_irq_enable(); - isb(); /* handle IRQ */ - local_irq_disable(); - } -} - -/* - * Wait for an non-spurious IRQ by polling in the guest or in - * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD). - * - * Note that this can theoretically hang forever, so we rely on having - * a timeout mechanism in the "runner", like: - * tools/testing/selftests/kselftest/runner.sh. - */ -static void poll_for_non_spurious_irq(enum sync_cmd usp_cmd) -{ - int h; - - local_irq_disable(); - - h = atomic_read(&shared_data.handled); - - local_irq_enable(); - while (h == atomic_read(&shared_data.handled)) { - if (usp_cmd == NO_USERSPACE_CMD) - cpu_relax(); - else - userspace_cmd(usp_cmd); - } - local_irq_disable(); -} - -static void wait_poll_for_irq(void) -{ - poll_for_non_spurious_irq(NO_USERSPACE_CMD); -} - -static void wait_sched_poll_for_irq(void) -{ - poll_for_non_spurious_irq(USERSPACE_SCHED_YIELD); -} - -static void wait_migrate_poll_for_irq(void) -{ - poll_for_non_spurious_irq(USERSPACE_MIGRATE_SELF); -} - -/* - * Sleep for usec microseconds by polling in the guest or in - * userspace (e.g. userspace_cmd=USERSPACE_SCHEDULE). - */ -static void guest_poll(enum arch_timer test_timer, uint64_t usec, - enum sync_cmd usp_cmd) -{ - uint64_t cycles = usec_to_cycles(usec); - /* Whichever timer we are testing with, sleep with the other. */ - enum arch_timer sleep_timer = 1 - test_timer; - uint64_t start = timer_get_cntct(sleep_timer); - - while ((timer_get_cntct(sleep_timer) - start) < cycles) { - if (usp_cmd == NO_USERSPACE_CMD) - cpu_relax(); - else - userspace_cmd(usp_cmd); - } -} - -static void sleep_poll(enum arch_timer timer, uint64_t usec) -{ - guest_poll(timer, usec, NO_USERSPACE_CMD); -} - -static void sleep_sched_poll(enum arch_timer timer, uint64_t usec) -{ - guest_poll(timer, usec, USERSPACE_SCHED_YIELD); -} - -static void sleep_migrate(enum arch_timer timer, uint64_t usec) -{ - guest_poll(timer, usec, USERSPACE_MIGRATE_SELF); -} - -static void sleep_in_userspace(enum arch_timer timer, uint64_t usec) -{ - userspace_sleep(usec); -} - -/* - * Reset the timer state to some nice values like the counter not being close - * to the edge, and the control register masked and disabled. 
- */ -static void reset_timer_state(enum arch_timer timer, uint64_t cnt) -{ - set_counter(timer, cnt); - timer_set_ctl(timer, CTL_IMASK); -} - -static void test_timer_xval(enum arch_timer timer, uint64_t xval, - enum timer_view tv, irq_wait_method_t wm, bool reset_state, - uint64_t reset_cnt) -{ - local_irq_disable(); - - if (reset_state) - reset_timer_state(timer, reset_cnt); - - set_xval_irq(timer, xval, CTL_ENABLE, tv); - - /* This method re-enables IRQs to handle the one we're looking for. */ - wm(); - - assert_irqs_handled(1); - local_irq_enable(); -} - -/* - * The test_timer_* functions will program the timer, wait for it, and assert - * the firing of the correct IRQ. - * - * These functions don't have a timeout and return as soon as they receive an - * IRQ. They can hang (forever), so we rely on having a timeout mechanism in - * the "runner", like: tools/testing/selftests/kselftest/runner.sh. - */ - -static void test_timer_cval(enum arch_timer timer, uint64_t cval, - irq_wait_method_t wm, bool reset_state, - uint64_t reset_cnt) -{ - test_timer_xval(timer, cval, TIMER_CVAL, wm, reset_state, reset_cnt); -} - -static void test_timer_tval(enum arch_timer timer, int32_t tval, - irq_wait_method_t wm, bool reset_state, - uint64_t reset_cnt) -{ - test_timer_xval(timer, (uint64_t) tval, TIMER_TVAL, wm, reset_state, - reset_cnt); -} - -static void test_xval_check_no_irq(enum arch_timer timer, uint64_t xval, - uint64_t usec, enum timer_view timer_view, - sleep_method_t guest_sleep) -{ - local_irq_disable(); - - set_xval_irq(timer, xval, CTL_ENABLE | CTL_IMASK, timer_view); - guest_sleep(timer, usec); - - local_irq_enable(); - isb(); - - /* Assume success (no IRQ) after waiting usec microseconds */ - assert_irqs_handled(0); -} - -static void test_cval_no_irq(enum arch_timer timer, uint64_t cval, - uint64_t usec, sleep_method_t wm) -{ - test_xval_check_no_irq(timer, cval, usec, TIMER_CVAL, wm); -} - -static void test_tval_no_irq(enum arch_timer timer, int32_t tval, uint64_t usec, - sleep_method_t wm) -{ - /* tval will be cast to an int32_t in test_xval_check_no_irq */ - test_xval_check_no_irq(timer, (uint64_t) tval, usec, TIMER_TVAL, wm); -} - -/* Test masking/unmasking a timer using the timer mask (not the IRQ mask). */ -static void test_timer_control_mask_then_unmask(enum arch_timer timer) -{ - reset_timer_state(timer, DEF_CNT); - set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); - - /* Unmask the timer, and then get an IRQ. */ - local_irq_disable(); - timer_set_ctl(timer, CTL_ENABLE); - /* This method re-enables IRQs to handle the one we're looking for. */ - wait_for_non_spurious_irq(); - - assert_irqs_handled(1); - local_irq_enable(); -} - -/* Check that timer control masks actually mask a timer being fired. */ -static void test_timer_control_masks(enum arch_timer timer) -{ - reset_timer_state(timer, DEF_CNT); - - /* Local IRQs are not masked at this point. */ - - set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); - - /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ - sleep_poll(timer, TIMEOUT_NO_IRQ_US); - - assert_irqs_handled(0); - timer_set_ctl(timer, CTL_IMASK); -} - -static void test_fire_a_timer_multiple_times(enum arch_timer timer, - irq_wait_method_t wm, int num) -{ - int i; - - local_irq_disable(); - reset_timer_state(timer, DEF_CNT); - - set_tval_irq(timer, 0, CTL_ENABLE); - - for (i = 1; i <= num; i++) { - /* This method re-enables IRQs to handle the one we're looking for. */ - wm(); - - /* The IRQ handler masked and disabled the timer. 
- * Enable and unmmask it again. - */ - timer_set_ctl(timer, CTL_ENABLE); - - assert_irqs_handled(i); - } - - local_irq_enable(); -} - -static void test_timers_fired_multiple_times(enum arch_timer timer) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) - test_fire_a_timer_multiple_times(timer, irq_wait_method[i], 10); -} - -/* - * Set a timer for tval=delta_1_ms then reprogram it to - * tval=delta_2_ms. Check that we get the timer fired. There is no - * timeout for the wait: we use the wfi instruction. - */ -static void test_reprogramming_timer(enum arch_timer timer, irq_wait_method_t wm, - int32_t delta_1_ms, int32_t delta_2_ms) -{ - local_irq_disable(); - reset_timer_state(timer, DEF_CNT); - - /* Program the timer to DEF_CNT + delta_1_ms. */ - set_tval_irq(timer, msec_to_cycles(delta_1_ms), CTL_ENABLE); - - /* Reprogram the timer to DEF_CNT + delta_2_ms. */ - timer_set_tval(timer, msec_to_cycles(delta_2_ms)); - - /* This method re-enables IRQs to handle the one we're looking for. */ - wm(); - - /* The IRQ should arrive at DEF_CNT + delta_2_ms (or after). */ - GUEST_ASSERT(timer_get_cntct(timer) >= - DEF_CNT + msec_to_cycles(delta_2_ms)); - - local_irq_enable(); - assert_irqs_handled(1); -}; - -static void test_reprogram_timers(enum arch_timer timer) -{ - int i; - uint64_t base_wait = test_args.wait_ms; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - /* - * Ensure reprogramming works whether going from a - * longer time to a shorter or vice versa. - */ - test_reprogramming_timer(timer, irq_wait_method[i], 2 * base_wait, - base_wait); - test_reprogramming_timer(timer, irq_wait_method[i], base_wait, - 2 * base_wait); - } -} - -static void test_basic_functionality(enum arch_timer timer) -{ - int32_t tval = (int32_t) msec_to_cycles(test_args.wait_ms); - uint64_t cval = DEF_CNT + msec_to_cycles(test_args.wait_ms); - int i; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - irq_wait_method_t wm = irq_wait_method[i]; - - test_timer_cval(timer, cval, wm, true, DEF_CNT); - test_timer_tval(timer, tval, wm, true, DEF_CNT); - } -} - -/* - * This test checks basic timer behavior without actually firing timers, things - * like: the relationship between cval and tval, tval down-counting. - */ -static void timers_sanity_checks(enum arch_timer timer, bool use_sched) -{ - reset_timer_state(timer, DEF_CNT); - - local_irq_disable(); - - /* cval in the past */ - timer_set_cval(timer, - timer_get_cntct(timer) - - msec_to_cycles(test_args.wait_ms)); - if (use_sched) - userspace_migrate_vcpu(); - GUEST_ASSERT(timer_get_tval(timer) < 0); - - /* tval in the past */ - timer_set_tval(timer, -1); - if (use_sched) - userspace_migrate_vcpu(); - GUEST_ASSERT(timer_get_cval(timer) < timer_get_cntct(timer)); - - /* tval larger than TVAL_MAX. This requires programming with - * timer_set_cval instead so the value is expressible - */ - timer_set_cval(timer, - timer_get_cntct(timer) + TVAL_MAX + - msec_to_cycles(test_args.wait_ms)); - if (use_sched) - userspace_migrate_vcpu(); - GUEST_ASSERT(timer_get_tval(timer) <= 0); - - /* - * tval larger than 2 * TVAL_MAX. - * Twice the TVAL_MAX completely loops around the TVAL. - */ - timer_set_cval(timer, - timer_get_cntct(timer) + 2ULL * TVAL_MAX + - msec_to_cycles(test_args.wait_ms)); - if (use_sched) - userspace_migrate_vcpu(); - GUEST_ASSERT(timer_get_tval(timer) <= - msec_to_cycles(test_args.wait_ms)); - - /* negative tval that rollovers from 0. 
*/ - set_counter(timer, msec_to_cycles(1)); - timer_set_tval(timer, -1 * msec_to_cycles(test_args.wait_ms)); - if (use_sched) - userspace_migrate_vcpu(); - GUEST_ASSERT(timer_get_cval(timer) >= (CVAL_MAX - msec_to_cycles(test_args.wait_ms))); - - /* tval should keep down-counting from 0 to -1. */ - timer_set_tval(timer, 0); - sleep_poll(timer, 1); - GUEST_ASSERT(timer_get_tval(timer) < 0); - - local_irq_enable(); - - /* Mask and disable any pending timer. */ - timer_set_ctl(timer, CTL_IMASK); -} - -static void test_timers_sanity_checks(enum arch_timer timer) -{ - timers_sanity_checks(timer, false); - /* Check how KVM saves/restores these edge-case values. */ - timers_sanity_checks(timer, true); -} - -static void test_set_cnt_after_tval_max(enum arch_timer timer, irq_wait_method_t wm) -{ - local_irq_disable(); - reset_timer_state(timer, DEF_CNT); - - set_cval_irq(timer, - (uint64_t) TVAL_MAX + - msec_to_cycles(test_args.wait_ms) / 2, CTL_ENABLE); - - set_counter(timer, TVAL_MAX); - - /* This method re-enables IRQs to handle the one we're looking for. */ - wm(); - - assert_irqs_handled(1); - local_irq_enable(); -} - -/* Test timers set for: cval = now + TVAL_MAX + wait_ms / 2 */ -static void test_timers_above_tval_max(enum arch_timer timer) -{ - uint64_t cval; - int i; - - /* - * Test that the system is not implementing cval in terms of - * tval. If that was the case, setting a cval to "cval = now - * + TVAL_MAX + wait_ms" would wrap to "cval = now + - * wait_ms", and the timer would fire immediately. Test that it - * doesn't. - */ - for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { - reset_timer_state(timer, DEF_CNT); - cval = timer_get_cntct(timer) + TVAL_MAX + - msec_to_cycles(test_args.wait_ms); - test_cval_no_irq(timer, cval, - msecs_to_usecs(test_args.wait_ms) + - TIMEOUT_NO_IRQ_US, sleep_method[i]); - } - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - /* Get the IRQ by moving the counter forward. */ - test_set_cnt_after_tval_max(timer, irq_wait_method[i]); - } -} - -/* - * Template function to be used by the test_move_counter_ahead_* tests. It - * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and - * then waits for an IRQ. - */ -static void test_set_cnt_after_xval(enum arch_timer timer, uint64_t cnt_1, - uint64_t xval, uint64_t cnt_2, - irq_wait_method_t wm, enum timer_view tv) -{ - local_irq_disable(); - - set_counter(timer, cnt_1); - timer_set_ctl(timer, CTL_IMASK); - - set_xval_irq(timer, xval, CTL_ENABLE, tv); - set_counter(timer, cnt_2); - /* This method re-enables IRQs to handle the one we're looking for. */ - wm(); - - assert_irqs_handled(1); - local_irq_enable(); -} - -/* - * Template function to be used by the test_move_counter_ahead_* tests. It - * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and - * then waits for an IRQ. 
- */ -static void test_set_cnt_after_xval_no_irq(enum arch_timer timer, - uint64_t cnt_1, uint64_t xval, - uint64_t cnt_2, - sleep_method_t guest_sleep, - enum timer_view tv) -{ - local_irq_disable(); - - set_counter(timer, cnt_1); - timer_set_ctl(timer, CTL_IMASK); - - set_xval_irq(timer, xval, CTL_ENABLE, tv); - set_counter(timer, cnt_2); - guest_sleep(timer, TIMEOUT_NO_IRQ_US); - - local_irq_enable(); - isb(); - - /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ - assert_irqs_handled(0); - timer_set_ctl(timer, CTL_IMASK); -} - -static void test_set_cnt_after_tval(enum arch_timer timer, uint64_t cnt_1, - int32_t tval, uint64_t cnt_2, - irq_wait_method_t wm) -{ - test_set_cnt_after_xval(timer, cnt_1, tval, cnt_2, wm, TIMER_TVAL); -} - -static void test_set_cnt_after_cval(enum arch_timer timer, uint64_t cnt_1, - uint64_t cval, uint64_t cnt_2, - irq_wait_method_t wm) -{ - test_set_cnt_after_xval(timer, cnt_1, cval, cnt_2, wm, TIMER_CVAL); -} - -static void test_set_cnt_after_tval_no_irq(enum arch_timer timer, - uint64_t cnt_1, int32_t tval, - uint64_t cnt_2, sleep_method_t wm) -{ - test_set_cnt_after_xval_no_irq(timer, cnt_1, tval, cnt_2, wm, - TIMER_TVAL); -} - -static void test_set_cnt_after_cval_no_irq(enum arch_timer timer, - uint64_t cnt_1, uint64_t cval, - uint64_t cnt_2, sleep_method_t wm) -{ - test_set_cnt_after_xval_no_irq(timer, cnt_1, cval, cnt_2, wm, - TIMER_CVAL); -} - -/* Set a timer and then move the counter ahead of it. */ -static void test_move_counters_ahead_of_timers(enum arch_timer timer) -{ - int i; - int32_t tval; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - irq_wait_method_t wm = irq_wait_method[i]; - - test_set_cnt_after_cval(timer, 0, DEF_CNT, DEF_CNT + 1, wm); - test_set_cnt_after_cval(timer, CVAL_MAX, 1, 2, wm); - - /* Move counter ahead of negative tval. */ - test_set_cnt_after_tval(timer, 0, -1, DEF_CNT + 1, wm); - test_set_cnt_after_tval(timer, 0, -1, TVAL_MAX, wm); - tval = TVAL_MAX; - test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1, - wm); - } - - for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { - sleep_method_t sm = sleep_method[i]; - - test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); - } -} - -/* - * Program a timer, mask it, and then change the tval or counter to cancel it. - * Unmask it and check that nothing fires. - */ -static void test_move_counters_behind_timers(enum arch_timer timer) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { - sleep_method_t sm = sleep_method[i]; - - test_set_cnt_after_cval_no_irq(timer, DEF_CNT, DEF_CNT - 1, 0, - sm); - test_set_cnt_after_tval_no_irq(timer, DEF_CNT, -1, 0, sm); - } -} - -static void test_timers_in_the_past(enum arch_timer timer) -{ - int32_t tval = -1 * (int32_t) msec_to_cycles(test_args.wait_ms); - uint64_t cval; - int i; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - irq_wait_method_t wm = irq_wait_method[i]; - - /* set a timer wait_ms the past. */ - cval = DEF_CNT - msec_to_cycles(test_args.wait_ms); - test_timer_cval(timer, cval, wm, true, DEF_CNT); - test_timer_tval(timer, tval, wm, true, DEF_CNT); - - /* Set a timer to counter=0 (in the past) */ - test_timer_cval(timer, 0, wm, true, DEF_CNT); - - /* Set a time for tval=0 (now) */ - test_timer_tval(timer, 0, wm, true, DEF_CNT); - - /* Set a timer to as far in the past as possible */ - test_timer_tval(timer, TVAL_MIN, wm, true, DEF_CNT); - } - - /* - * Set the counter to wait_ms, and a tval to -wait_ms. There should be no - * IRQ as that tval means cval=CVAL_MAX-wait_ms. 
- */ - for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { - sleep_method_t sm = sleep_method[i]; - - set_counter(timer, msec_to_cycles(test_args.wait_ms)); - test_tval_no_irq(timer, tval, TIMEOUT_NO_IRQ_US, sm); - } -} - -static void test_long_timer_delays(enum arch_timer timer) -{ - int32_t tval = (int32_t) msec_to_cycles(test_args.long_wait_ms); - uint64_t cval = DEF_CNT + msec_to_cycles(test_args.long_wait_ms); - int i; - - for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { - irq_wait_method_t wm = irq_wait_method[i]; - - test_timer_cval(timer, cval, wm, true, DEF_CNT); - test_timer_tval(timer, tval, wm, true, DEF_CNT); - } -} - -static void guest_run_iteration(enum arch_timer timer) -{ - test_basic_functionality(timer); - test_timers_sanity_checks(timer); - - test_timers_above_tval_max(timer); - test_timers_in_the_past(timer); - - test_move_counters_ahead_of_timers(timer); - test_move_counters_behind_timers(timer); - test_reprogram_timers(timer); - - test_timers_fired_multiple_times(timer); - - test_timer_control_mask_then_unmask(timer); - test_timer_control_masks(timer); -} - -static void guest_code(enum arch_timer timer) -{ - int i; - - local_irq_disable(); - - gic_init(GIC_V3, 1); - - timer_set_ctl(VIRTUAL, CTL_IMASK); - timer_set_ctl(PHYSICAL, CTL_IMASK); - - gic_irq_enable(vtimer_irq); - gic_irq_enable(ptimer_irq); - local_irq_enable(); - - for (i = 0; i < test_args.iterations; i++) { - GUEST_SYNC(i); - guest_run_iteration(timer); - } - - test_long_timer_delays(timer); - GUEST_DONE(); -} - -static uint32_t next_pcpu(void) -{ - uint32_t max = get_nprocs(); - uint32_t cur = sched_getcpu(); - uint32_t next = cur; - cpu_set_t cpuset; - - TEST_ASSERT(max > 1, "Need at least two physical cpus"); - - sched_getaffinity(0, sizeof(cpuset), &cpuset); - - do { - next = (next + 1) % CPU_SETSIZE; - } while (!CPU_ISSET(next, &cpuset)); - - return next; -} - -static void migrate_self(uint32_t new_pcpu) -{ - int ret; - cpu_set_t cpuset; - pthread_t thread; - - thread = pthread_self(); - - CPU_ZERO(&cpuset); - CPU_SET(new_pcpu, &cpuset); - - pr_debug("Migrating from %u to %u\n", sched_getcpu(), new_pcpu); - - ret = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); - - TEST_ASSERT(ret == 0, "Failed to migrate to pCPU: %u; ret: %d\n", - new_pcpu, ret); -} - -static void kvm_set_cntxct(struct kvm_vcpu *vcpu, uint64_t cnt, - enum arch_timer timer) -{ - if (timer == PHYSICAL) - vcpu_set_reg(vcpu, KVM_REG_ARM_PTIMER_CNT, cnt); - else - vcpu_set_reg(vcpu, KVM_REG_ARM_TIMER_CNT, cnt); -} - -static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) -{ - enum sync_cmd cmd = uc->args[1]; - uint64_t val = uc->args[2]; - enum arch_timer timer = uc->args[3]; - - switch (cmd) { - case SET_COUNTER_VALUE: - kvm_set_cntxct(vcpu, val, timer); - break; - case USERSPACE_USLEEP: - usleep(val); - break; - case USERSPACE_SCHED_YIELD: - sched_yield(); - break; - case USERSPACE_MIGRATE_SELF: - migrate_self(next_pcpu()); - break; - default: - break; - } -} - -static void test_run(struct kvm_vm *vm, struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - /* Start on CPU 0 */ - migrate_self(0); - - while (true) { - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - handle_sync(vcpu, &uc); - break; - case UCALL_DONE: - goto out; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - goto out; - default: - TEST_FAIL("Unexpected guest exit\n"); - } - } - - out: - return; -} - -static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) -{ - vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, - 
KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); - vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); - - sync_global_to_guest(vm, ptimer_irq); - sync_global_to_guest(vm, vtimer_irq); - - pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); -} - -static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, - enum arch_timer timer) -{ - *vm = vm_create_with_one_vcpu(vcpu, guest_code); - TEST_ASSERT(*vm, "Failed to create the test VM\n"); - - vm_init_descriptor_tables(*vm); - vm_install_exception_handler(*vm, VECTOR_IRQ_CURRENT, - guest_irq_handler); - - vcpu_init_descriptor_tables(*vcpu); - vcpu_args_set(*vcpu, 1, timer); - - test_init_timer_irq(*vm, *vcpu); - vgic_v3_setup(*vm, 1, 64); - sync_global_to_guest(*vm, test_args); -} - -static void test_print_help(char *name) -{ - pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n" - , name); - pr_info("\t-i: Number of iterations (default: %u)\n", - NR_TEST_ITERS_DEF); - pr_info("\t-b: Test both physical and virtual timers (default: true)\n"); - pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n", - LONG_WAIT_TEST_MS); - pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n", - WAIT_TEST_MS); - pr_info("\t-p: Test physical timer (default: true)\n"); - pr_info("\t-v: Test virtual timer (default: true)\n"); - pr_info("\t-h: Print this help message\n"); -} - -static bool parse_args(int argc, char *argv[]) -{ - int opt; - - while ((opt = getopt(argc, argv, "bhi:l:pvw:")) != -1) { - switch (opt) { - case 'b': - test_args.test_physical = true; - test_args.test_virtual = true; - break; - case 'i': - test_args.iterations = - atoi_positive("Number of iterations", optarg); - break; - case 'l': - test_args.long_wait_ms = - atoi_positive("Long wait time", optarg); - break; - case 'p': - test_args.test_physical = true; - test_args.test_virtual = false; - break; - case 'v': - test_args.test_virtual = true; - test_args.test_physical = false; - break; - case 'w': - test_args.wait_ms = atoi_positive("Wait time", optarg); - break; - case 'h': - default: - goto err; - } - } - - return true; - - err: - test_print_help(argv[0]); - return false; -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - if (!parse_args(argc, argv)) - exit(KSFT_SKIP); - - if (test_args.test_virtual) { - test_vm_create(&vm, &vcpu, VIRTUAL); - test_run(vm, vcpu); - kvm_vm_free(vm); - } - - if (test_args.test_physical) { - test_vm_create(&vm, &vcpu, PHYSICAL); - test_run(vm, vcpu); - kvm_vm_free(vm); - } - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c deleted file mode 100644 index c7fb55c9135b..000000000000 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ /dev/null @@ -1,607 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -#define MDSCR_KDE (1 << 13) -#define MDSCR_MDE (1 << 15) -#define MDSCR_SS (1 << 0) - -#define DBGBCR_LEN8 (0xff << 5) -#define DBGBCR_EXEC (0x0 << 3) -#define DBGBCR_EL1 (0x1 << 1) -#define DBGBCR_E (0x1 << 0) -#define DBGBCR_LBN_SHIFT 16 -#define DBGBCR_BT_SHIFT 20 -#define DBGBCR_BT_ADDR_LINK_CTX (0x1 << DBGBCR_BT_SHIFT) -#define DBGBCR_BT_CTX_LINK (0x3 << DBGBCR_BT_SHIFT) - -#define DBGWCR_LEN8 (0xff << 5) -#define DBGWCR_RD (0x1 << 3) -#define DBGWCR_WR (0x2 << 3) -#define DBGWCR_EL1 
(0x1 << 1) -#define DBGWCR_E (0x1 << 0) -#define DBGWCR_LBN_SHIFT 16 -#define DBGWCR_WT_SHIFT 20 -#define DBGWCR_WT_LINK (0x1 << DBGWCR_WT_SHIFT) - -#define SPSR_D (1 << 9) -#define SPSR_SS (1 << 21) - -extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start, hw_bp_ctx; -extern unsigned char iter_ss_begin, iter_ss_end; -static volatile uint64_t sw_bp_addr, hw_bp_addr; -static volatile uint64_t wp_addr, wp_data_addr; -static volatile uint64_t svc_addr; -static volatile uint64_t ss_addr[4], ss_idx; -#define PC(v) ((uint64_t)&(v)) - -#define GEN_DEBUG_WRITE_REG(reg_name) \ -static void write_##reg_name(int num, uint64_t val) \ -{ \ - switch (num) { \ - case 0: \ - write_sysreg(val, reg_name##0_el1); \ - break; \ - case 1: \ - write_sysreg(val, reg_name##1_el1); \ - break; \ - case 2: \ - write_sysreg(val, reg_name##2_el1); \ - break; \ - case 3: \ - write_sysreg(val, reg_name##3_el1); \ - break; \ - case 4: \ - write_sysreg(val, reg_name##4_el1); \ - break; \ - case 5: \ - write_sysreg(val, reg_name##5_el1); \ - break; \ - case 6: \ - write_sysreg(val, reg_name##6_el1); \ - break; \ - case 7: \ - write_sysreg(val, reg_name##7_el1); \ - break; \ - case 8: \ - write_sysreg(val, reg_name##8_el1); \ - break; \ - case 9: \ - write_sysreg(val, reg_name##9_el1); \ - break; \ - case 10: \ - write_sysreg(val, reg_name##10_el1); \ - break; \ - case 11: \ - write_sysreg(val, reg_name##11_el1); \ - break; \ - case 12: \ - write_sysreg(val, reg_name##12_el1); \ - break; \ - case 13: \ - write_sysreg(val, reg_name##13_el1); \ - break; \ - case 14: \ - write_sysreg(val, reg_name##14_el1); \ - break; \ - case 15: \ - write_sysreg(val, reg_name##15_el1); \ - break; \ - default: \ - GUEST_ASSERT(0); \ - } \ -} - -/* Define write_dbgbcr()/write_dbgbvr()/write_dbgwcr()/write_dbgwvr() */ -GEN_DEBUG_WRITE_REG(dbgbcr) -GEN_DEBUG_WRITE_REG(dbgbvr) -GEN_DEBUG_WRITE_REG(dbgwcr) -GEN_DEBUG_WRITE_REG(dbgwvr) - -static void reset_debug_state(void) -{ - uint8_t brps, wrps, i; - uint64_t dfr0; - - asm volatile("msr daifset, #8"); - - write_sysreg(0, osdlr_el1); - write_sysreg(0, oslar_el1); - isb(); - - write_sysreg(0, mdscr_el1); - write_sysreg(0, contextidr_el1); - - /* Reset all bcr/bvr/wcr/wvr registers */ - dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0); - for (i = 0; i <= brps; i++) { - write_dbgbcr(i, 0); - write_dbgbvr(i, 0); - } - wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0); - for (i = 0; i <= wrps; i++) { - write_dbgwcr(i, 0); - write_dbgwvr(i, 0); - } - - isb(); -} - -static void enable_os_lock(void) -{ - write_sysreg(1, oslar_el1); - isb(); - - GUEST_ASSERT(read_sysreg(oslsr_el1) & 2); -} - -static void enable_monitor_debug_exceptions(void) -{ - uint32_t mdscr; - - asm volatile("msr daifclr, #8"); - - mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; - write_sysreg(mdscr, mdscr_el1); - isb(); -} - -static void install_wp(uint8_t wpn, uint64_t addr) -{ - uint32_t wcr; - - wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; - write_dbgwcr(wpn, wcr); - write_dbgwvr(wpn, addr); - - isb(); - - enable_monitor_debug_exceptions(); -} - -static void install_hw_bp(uint8_t bpn, uint64_t addr) -{ - uint32_t bcr; - - bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; - write_dbgbcr(bpn, bcr); - write_dbgbvr(bpn, addr); - isb(); - - enable_monitor_debug_exceptions(); -} - -static void install_wp_ctx(uint8_t addr_wp, uint8_t ctx_bp, uint64_t addr, - uint64_t ctx) -{ - uint32_t wcr; - uint64_t 
ctx_bcr; - - /* Setup a context-aware breakpoint for Linked Context ID Match */ - ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | - DBGBCR_BT_CTX_LINK; - write_dbgbcr(ctx_bp, ctx_bcr); - write_dbgbvr(ctx_bp, ctx); - - /* Setup a linked watchpoint (linked to the context-aware breakpoint) */ - wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E | - DBGWCR_WT_LINK | ((uint32_t)ctx_bp << DBGWCR_LBN_SHIFT); - write_dbgwcr(addr_wp, wcr); - write_dbgwvr(addr_wp, addr); - isb(); - - enable_monitor_debug_exceptions(); -} - -void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr, - uint64_t ctx) -{ - uint32_t addr_bcr, ctx_bcr; - - /* Setup a context-aware breakpoint for Linked Context ID Match */ - ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | - DBGBCR_BT_CTX_LINK; - write_dbgbcr(ctx_bp, ctx_bcr); - write_dbgbvr(ctx_bp, ctx); - - /* - * Setup a normal breakpoint for Linked Address Match, and link it - * to the context-aware breakpoint. - */ - addr_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | - DBGBCR_BT_ADDR_LINK_CTX | - ((uint32_t)ctx_bp << DBGBCR_LBN_SHIFT); - write_dbgbcr(addr_bp, addr_bcr); - write_dbgbvr(addr_bp, addr); - isb(); - - enable_monitor_debug_exceptions(); -} - -static void install_ss(void) -{ - uint32_t mdscr; - - asm volatile("msr daifclr, #8"); - - mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_SS; - write_sysreg(mdscr, mdscr_el1); - isb(); -} - -static volatile char write_data; - -static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) -{ - uint64_t ctx = 0xabcdef; /* a random context number */ - - /* Software-breakpoint */ - reset_debug_state(); - asm volatile("sw_bp: brk #0"); - GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); - - /* Hardware-breakpoint */ - reset_debug_state(); - install_hw_bp(bpn, PC(hw_bp)); - asm volatile("hw_bp: nop"); - GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); - - /* Hardware-breakpoint + svc */ - reset_debug_state(); - install_hw_bp(bpn, PC(bp_svc)); - asm volatile("bp_svc: svc #0"); - GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); - GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); - - /* Hardware-breakpoint + software-breakpoint */ - reset_debug_state(); - install_hw_bp(bpn, PC(bp_brk)); - asm volatile("bp_brk: brk #0"); - GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); - GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); - - /* Watchpoint */ - reset_debug_state(); - install_wp(wpn, PC(write_data)); - write_data = 'x'; - GUEST_ASSERT_EQ(write_data, 'x'); - GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); - - /* Single-step */ - reset_debug_state(); - install_ss(); - ss_idx = 0; - asm volatile("ss_start:\n" - "mrs x0, esr_el1\n" - "add x0, x0, #1\n" - "msr daifset, #8\n" - : : : "x0"); - GUEST_ASSERT_EQ(ss_addr[0], PC(ss_start)); - GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); - GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); - - /* OS Lock does not block software-breakpoint */ - reset_debug_state(); - enable_os_lock(); - sw_bp_addr = 0; - asm volatile("sw_bp2: brk #0"); - GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp2)); - - /* OS Lock blocking hardware-breakpoint */ - reset_debug_state(); - enable_os_lock(); - install_hw_bp(bpn, PC(hw_bp2)); - hw_bp_addr = 0; - asm volatile("hw_bp2: nop"); - GUEST_ASSERT_EQ(hw_bp_addr, 0); - - /* OS Lock blocking watchpoint */ - reset_debug_state(); - enable_os_lock(); - write_data = '\0'; - wp_data_addr = 0; - install_wp(wpn, PC(write_data)); - write_data = 'x'; - GUEST_ASSERT_EQ(write_data, 'x'); - GUEST_ASSERT_EQ(wp_data_addr, 0); - - /* OS Lock blocking single-step */ - 
reset_debug_state(); - enable_os_lock(); - ss_addr[0] = 0; - install_ss(); - ss_idx = 0; - asm volatile("mrs x0, esr_el1\n\t" - "add x0, x0, #1\n\t" - "msr daifset, #8\n\t" - : : : "x0"); - GUEST_ASSERT_EQ(ss_addr[0], 0); - - /* Linked hardware-breakpoint */ - hw_bp_addr = 0; - reset_debug_state(); - install_hw_bp_ctx(bpn, ctx_bpn, PC(hw_bp_ctx), ctx); - /* Set context id */ - write_sysreg(ctx, contextidr_el1); - isb(); - asm volatile("hw_bp_ctx: nop"); - write_sysreg(0, contextidr_el1); - GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx)); - - /* Linked watchpoint */ - reset_debug_state(); - install_wp_ctx(wpn, ctx_bpn, PC(write_data), ctx); - /* Set context id */ - write_sysreg(ctx, contextidr_el1); - isb(); - write_data = 'x'; - GUEST_ASSERT_EQ(write_data, 'x'); - GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); - - GUEST_DONE(); -} - -static void guest_sw_bp_handler(struct ex_regs *regs) -{ - sw_bp_addr = regs->pc; - regs->pc += 4; -} - -static void guest_hw_bp_handler(struct ex_regs *regs) -{ - hw_bp_addr = regs->pc; - regs->pstate |= SPSR_D; -} - -static void guest_wp_handler(struct ex_regs *regs) -{ - wp_data_addr = read_sysreg(far_el1); - wp_addr = regs->pc; - regs->pstate |= SPSR_D; -} - -static void guest_ss_handler(struct ex_regs *regs) -{ - __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx); - ss_addr[ss_idx++] = regs->pc; - regs->pstate |= SPSR_SS; -} - -static void guest_svc_handler(struct ex_regs *regs) -{ - svc_addr = regs->pc; -} - -static void guest_code_ss(int test_cnt) -{ - uint64_t i; - uint64_t bvr, wvr, w_bvr, w_wvr; - - for (i = 0; i < test_cnt; i++) { - /* Bits [1:0] of dbg{b,w}vr are RES0 */ - w_bvr = i << 2; - w_wvr = i << 2; - - /* - * Enable Single Step execution. Note! This _must_ be a bare - * ucall as the ucall() path uses atomic operations to manage - * the ucall structures, and the built-in "atomics" are usually - * implemented via exclusive access instructions. The exlusive - * monitor is cleared on ERET, and so taking debug exceptions - * during a LDREX=>STREX sequence will prevent forward progress - * and hang the guest/test. - */ - GUEST_UCALL_NONE(); - - /* - * The userspace will verify that the pc is as expected during - * single step execution between iter_ss_begin and iter_ss_end. - */ - asm volatile("iter_ss_begin:nop\n"); - - write_sysreg(w_bvr, dbgbvr0_el1); - write_sysreg(w_wvr, dbgwvr0_el1); - bvr = read_sysreg(dbgbvr0_el1); - wvr = read_sysreg(dbgwvr0_el1); - - /* Userspace disables Single Step when the end is nigh. 
*/ - asm volatile("iter_ss_end:\n"); - - GUEST_ASSERT_EQ(bvr, w_bvr); - GUEST_ASSERT_EQ(wvr, w_wvr); - } - GUEST_DONE(); -} - -static int debug_version(uint64_t id_aa64dfr0) -{ - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0); -} - -static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_BRK64, guest_sw_bp_handler); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_BREAKPT_CUR, guest_hw_bp_handler); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_WATCHPT_CUR, guest_wp_handler); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_SOFTSTP_CUR, guest_ss_handler); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_SVC64, guest_svc_handler); - - /* Specify bpn/wpn/ctx_bpn to be tested */ - vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn); - pr_debug("Use bpn#%d, wpn#%d and ctx_bpn#%d\n", bpn, wpn, ctx_bpn); - - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - -done: - kvm_vm_free(vm); -} - -void test_single_step_from_userspace(int test_cnt) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - struct kvm_run *run; - uint64_t pc, cmd; - uint64_t test_pc = 0; - bool ss_enable = false; - struct kvm_guest_debug debug = {}; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); - run = vcpu->run; - vcpu_args_set(vcpu, 1, test_cnt); - - while (1) { - vcpu_run(vcpu); - if (run->exit_reason != KVM_EXIT_DEBUG) { - cmd = get_ucall(vcpu, &uc); - if (cmd == UCALL_ABORT) { - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - } else if (cmd == UCALL_DONE) { - break; - } - - TEST_ASSERT(cmd == UCALL_NONE, - "Unexpected ucall cmd 0x%lx", cmd); - - debug.control = KVM_GUESTDBG_ENABLE | - KVM_GUESTDBG_SINGLESTEP; - ss_enable = true; - vcpu_guest_debug_set(vcpu, &debug); - continue; - } - - TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); - - /* Check if the current pc is expected. */ - pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); - TEST_ASSERT(!test_pc || pc == test_pc, - "Unexpected pc 0x%lx (expected 0x%lx)", - pc, test_pc); - - if ((pc + 4) == (uint64_t)&iter_ss_end) { - test_pc = 0; - debug.control = KVM_GUESTDBG_ENABLE; - ss_enable = false; - vcpu_guest_debug_set(vcpu, &debug); - continue; - } - - /* - * If the current pc is between iter_ss_bgin and - * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should - * be the current pc + 4. - */ - if ((pc >= (uint64_t)&iter_ss_begin) && - (pc < (uint64_t)&iter_ss_end)) - test_pc = pc + 4; - else - test_pc = 0; - } - - kvm_vm_free(vm); -} - -/* - * Run debug testing using the various breakpoint#, watchpoint# and - * context-aware breakpoint# with the given ID_AA64DFR0_EL1 configuration. 
- */ -void test_guest_debug_exceptions_all(uint64_t aa64dfr0) -{ - uint8_t brp_num, wrp_num, ctx_brp_num, normal_brp_num, ctx_brp_base; - int b, w, c; - - /* Number of breakpoints */ - brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1; - __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); - - /* Number of watchpoints */ - wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1; - - /* Number of context aware breakpoints */ - ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1; - - pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, - brp_num, wrp_num, ctx_brp_num); - - /* Number of normal (non-context aware) breakpoints */ - normal_brp_num = brp_num - ctx_brp_num; - - /* Lowest context aware breakpoint number */ - ctx_brp_base = normal_brp_num; - - /* Run tests with all supported breakpoints/watchpoints */ - for (c = ctx_brp_base; c < ctx_brp_base + ctx_brp_num; c++) { - for (b = 0; b < normal_brp_num; b++) { - for (w = 0; w < wrp_num; w++) - test_guest_debug_exceptions(b, w, c); - } - } -} - -static void help(char *name) -{ - puts(""); - printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); - puts(""); - exit(0); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - int opt; - int ss_iteration = 10000; - uint64_t aa64dfr0; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - aa64dfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); - __TEST_REQUIRE(debug_version(aa64dfr0) >= 6, - "Armv8 debug architecture not supported."); - kvm_vm_free(vm); - - while ((opt = getopt(argc, argv, "i:")) != -1) { - switch (opt) { - case 'i': - ss_iteration = atoi_positive("Number of iterations", optarg); - break; - case 'h': - default: - help(argv[0]); - break; - } - } - - test_guest_debug_exceptions_all(aa64dfr0); - test_single_step_from_userspace(ss_iteration); - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c deleted file mode 100644 index d43fb3f49050..000000000000 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Check for KVM_GET_REG_LIST regressions. - * - * Copyright (C) 2020, Red Hat, Inc. - * - * While the blessed list should be created from the oldest possible - * kernel, we can't go older than v5.2, though, because that's the first - * release which includes df205b5c6328 ("KVM: arm64: Filter out invalid - * core register IDs in KVM_GET_REG_LIST"). Without that commit the core - * registers won't match expectations. 
- */ -#include -#include "kvm_util.h" -#include "test_util.h" -#include "processor.h" - -struct feature_id_reg { - __u64 reg; - __u64 id_reg; - __u64 feat_shift; - __u64 feat_min; -}; - -static struct feature_id_reg feat_id_regs[] = { - { - ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 0, - 1 - }, - { - ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 8, - 1 - }, - { - ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 8, - 1 - }, - { - ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 16, - 1 - }, - { - ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - 16, - 1 - } -}; - -bool filter_reg(__u64 reg) -{ - /* - * DEMUX register presence depends on the host's CLIDR_EL1. - * This means there's no set of them that we can bless. - */ - if ((reg & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return true; - - return false; -} - -static bool check_supported_feat_reg(struct kvm_vcpu *vcpu, __u64 reg) -{ - int i, ret; - __u64 data, feat_val; - - for (i = 0; i < ARRAY_SIZE(feat_id_regs); i++) { - if (feat_id_regs[i].reg == reg) { - ret = __vcpu_get_reg(vcpu, feat_id_regs[i].id_reg, &data); - if (ret < 0) - return false; - - feat_val = ((data >> feat_id_regs[i].feat_shift) & 0xf); - return feat_val >= feat_id_regs[i].feat_min; - } - } - - return true; -} - -bool check_supported_reg(struct kvm_vcpu *vcpu, __u64 reg) -{ - return check_supported_feat_reg(vcpu, reg); -} - -bool check_reject_set(int err) -{ - return err == EPERM; -} - -void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) -{ - struct vcpu_reg_sublist *s; - int feature; - - for_each_sublist(c, s) { - if (s->finalize) { - feature = s->feature; - vcpu_ioctl(vcpu, KVM_ARM_VCPU_FINALIZE, &feature); - } - } -} - -#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) - -#define CORE_REGS_XX_NR_WORDS 2 -#define CORE_SPSR_XX_NR_WORDS 2 -#define CORE_FPREGS_XX_NR_WORDS 4 - -static const char *core_id_to_str(const char *prefix, __u64 id) -{ - __u64 core_off = id & ~REG_MASK, idx; - - /* - * core_off is the offset into struct kvm_regs - */ - switch (core_off) { - case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... - KVM_REG_ARM_CORE_REG(regs.regs[30]): - idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; - TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", prefix, idx); - return strdup_printf("KVM_REG_ARM_CORE_REG(regs.regs[%lld])", idx); - case KVM_REG_ARM_CORE_REG(regs.sp): - return "KVM_REG_ARM_CORE_REG(regs.sp)"; - case KVM_REG_ARM_CORE_REG(regs.pc): - return "KVM_REG_ARM_CORE_REG(regs.pc)"; - case KVM_REG_ARM_CORE_REG(regs.pstate): - return "KVM_REG_ARM_CORE_REG(regs.pstate)"; - case KVM_REG_ARM_CORE_REG(sp_el1): - return "KVM_REG_ARM_CORE_REG(sp_el1)"; - case KVM_REG_ARM_CORE_REG(elr_el1): - return "KVM_REG_ARM_CORE_REG(elr_el1)"; - case KVM_REG_ARM_CORE_REG(spsr[0]) ... - KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): - idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; - TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", prefix, idx); - return strdup_printf("KVM_REG_ARM_CORE_REG(spsr[%lld])", idx); - case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... 
- KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): - idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; - TEST_ASSERT(idx < 32, "%s: Unexpected fp_regs.vregs index: %lld", prefix, idx); - return strdup_printf("KVM_REG_ARM_CORE_REG(fp_regs.vregs[%lld])", idx); - case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): - return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; - case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): - return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; - } - - TEST_FAIL("%s: Unknown core reg id: 0x%llx", prefix, id); - return NULL; -} - -static const char *sve_id_to_str(const char *prefix, __u64 id) -{ - __u64 sve_off, n, i; - - if (id == KVM_REG_ARM64_SVE_VLS) - return "KVM_REG_ARM64_SVE_VLS"; - - sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); - i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); - - TEST_ASSERT(i == 0, "%s: Currently we don't expect slice > 0, reg id 0x%llx", prefix, id); - - switch (sve_off) { - case KVM_REG_ARM64_SVE_ZREG_BASE ... - KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: - n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); - TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), - "%s: Unexpected bits set in SVE ZREG id: 0x%llx", prefix, id); - return strdup_printf("KVM_REG_ARM64_SVE_ZREG(%lld, 0)", n); - case KVM_REG_ARM64_SVE_PREG_BASE ... - KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: - n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); - TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), - "%s: Unexpected bits set in SVE PREG id: 0x%llx", prefix, id); - return strdup_printf("KVM_REG_ARM64_SVE_PREG(%lld, 0)", n); - case KVM_REG_ARM64_SVE_FFR_BASE: - TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), - "%s: Unexpected bits set in SVE FFR id: 0x%llx", prefix, id); - return "KVM_REG_ARM64_SVE_FFR(0)"; - } - - return NULL; -} - -void print_reg(const char *prefix, __u64 id) -{ - unsigned op0, op1, crn, crm, op2; - const char *reg_size = NULL; - - TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, - "%s: KVM_REG_ARM64 missing in reg id: 0x%llx", prefix, id); - - switch (id & KVM_REG_SIZE_MASK) { - case KVM_REG_SIZE_U8: - reg_size = "KVM_REG_SIZE_U8"; - break; - case KVM_REG_SIZE_U16: - reg_size = "KVM_REG_SIZE_U16"; - break; - case KVM_REG_SIZE_U32: - reg_size = "KVM_REG_SIZE_U32"; - break; - case KVM_REG_SIZE_U64: - reg_size = "KVM_REG_SIZE_U64"; - break; - case KVM_REG_SIZE_U128: - reg_size = "KVM_REG_SIZE_U128"; - break; - case KVM_REG_SIZE_U256: - reg_size = "KVM_REG_SIZE_U256"; - break; - case KVM_REG_SIZE_U512: - reg_size = "KVM_REG_SIZE_U512"; - break; - case KVM_REG_SIZE_U1024: - reg_size = "KVM_REG_SIZE_U1024"; - break; - case KVM_REG_SIZE_U2048: - reg_size = "KVM_REG_SIZE_U2048"; - break; - default: - TEST_FAIL("%s: Unexpected reg size: 0x%llx in reg id: 0x%llx", - prefix, (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); - } - - switch (id & KVM_REG_ARM_COPROC_MASK) { - case KVM_REG_ARM_CORE: - printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(prefix, id)); - break; - case KVM_REG_ARM_DEMUX: - TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), - "%s: Unexpected bits set in DEMUX reg id: 0x%llx", prefix, id); - printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", - reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); - break; - case KVM_REG_ARM64_SYSREG: - op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; - op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; - crn = 
(id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; - crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; - op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; - TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), - "%s: Unexpected bits set in SYSREG reg id: 0x%llx", prefix, id); - printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); - break; - case KVM_REG_ARM_FW: - TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), - "%s: Unexpected bits set in FW reg id: 0x%llx", prefix, id); - printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); - break; - case KVM_REG_ARM_FW_FEAT_BMAP: - TEST_ASSERT(id == KVM_REG_ARM_FW_FEAT_BMAP_REG(id & 0xffff), - "%s: Unexpected bits set in the bitmap feature FW reg id: 0x%llx", prefix, id); - printf("\tKVM_REG_ARM_FW_FEAT_BMAP_REG(%lld),\n", id & 0xffff); - break; - case KVM_REG_ARM64_SVE: - printf("\t%s,\n", sve_id_to_str(prefix, id)); - break; - default: - TEST_FAIL("%s: Unexpected coproc type: 0x%llx in reg id: 0x%llx", - prefix, (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); - } -} - -/* - * The original blessed list was primed with the output of kernel version - * v4.15 with --core-reg-fixup and then later updated with new registers. - * (The --core-reg-fixup option and it's fixup function have been removed - * from the test, as it's unlikely to use this type of test on a kernel - * older than v5.2.) - * - * The blessed list is up to date with kernel version v6.4 (or so we hope) - */ -static __u64 base_regs[] = { - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), - KVM_REG_ARM64 | 
KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), - KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), - KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), - KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), - KVM_REG_ARM_FW_REG(0), /* KVM_REG_ARM_PSCI_VERSION */ - KVM_REG_ARM_FW_REG(1), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 */ - KVM_REG_ARM_FW_REG(2), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 */ - KVM_REG_ARM_FW_REG(3), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 */ - KVM_REG_ARM_FW_FEAT_BMAP_REG(0), /* KVM_REG_ARM_STD_BMAP */ - KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ - KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ - ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ - ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ - ARM64_SYS_REG(3, 3, 14, 0, 2), - ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ - ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ - ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ - ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ - ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ - ARM64_SYS_REG(2, 0, 0, 0, 4), - ARM64_SYS_REG(2, 0, 0, 0, 5), - ARM64_SYS_REG(2, 0, 0, 0, 6), - ARM64_SYS_REG(2, 0, 0, 0, 7), - ARM64_SYS_REG(2, 0, 0, 1, 4), - ARM64_SYS_REG(2, 0, 0, 1, 5), - ARM64_SYS_REG(2, 0, 0, 1, 6), - ARM64_SYS_REG(2, 0, 0, 1, 7), - ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ - ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ - ARM64_SYS_REG(2, 0, 0, 2, 4), - ARM64_SYS_REG(2, 0, 0, 2, 5), - ARM64_SYS_REG(2, 0, 0, 2, 6), - ARM64_SYS_REG(2, 0, 0, 2, 7), - ARM64_SYS_REG(2, 0, 0, 3, 4), - ARM64_SYS_REG(2, 0, 0, 3, 5), - ARM64_SYS_REG(2, 0, 0, 3, 6), - ARM64_SYS_REG(2, 0, 0, 3, 7), - ARM64_SYS_REG(2, 0, 0, 4, 4), - ARM64_SYS_REG(2, 0, 0, 4, 5), - ARM64_SYS_REG(2, 0, 0, 4, 6), - ARM64_SYS_REG(2, 0, 0, 4, 7), - ARM64_SYS_REG(2, 
0, 0, 5, 4), - ARM64_SYS_REG(2, 0, 0, 5, 5), - ARM64_SYS_REG(2, 0, 0, 5, 6), - ARM64_SYS_REG(2, 0, 0, 5, 7), - ARM64_SYS_REG(2, 0, 0, 6, 4), - ARM64_SYS_REG(2, 0, 0, 6, 5), - ARM64_SYS_REG(2, 0, 0, 6, 6), - ARM64_SYS_REG(2, 0, 0, 6, 7), - ARM64_SYS_REG(2, 0, 0, 7, 4), - ARM64_SYS_REG(2, 0, 0, 7, 5), - ARM64_SYS_REG(2, 0, 0, 7, 6), - ARM64_SYS_REG(2, 0, 0, 7, 7), - ARM64_SYS_REG(2, 0, 0, 8, 4), - ARM64_SYS_REG(2, 0, 0, 8, 5), - ARM64_SYS_REG(2, 0, 0, 8, 6), - ARM64_SYS_REG(2, 0, 0, 8, 7), - ARM64_SYS_REG(2, 0, 0, 9, 4), - ARM64_SYS_REG(2, 0, 0, 9, 5), - ARM64_SYS_REG(2, 0, 0, 9, 6), - ARM64_SYS_REG(2, 0, 0, 9, 7), - ARM64_SYS_REG(2, 0, 0, 10, 4), - ARM64_SYS_REG(2, 0, 0, 10, 5), - ARM64_SYS_REG(2, 0, 0, 10, 6), - ARM64_SYS_REG(2, 0, 0, 10, 7), - ARM64_SYS_REG(2, 0, 0, 11, 4), - ARM64_SYS_REG(2, 0, 0, 11, 5), - ARM64_SYS_REG(2, 0, 0, 11, 6), - ARM64_SYS_REG(2, 0, 0, 11, 7), - ARM64_SYS_REG(2, 0, 0, 12, 4), - ARM64_SYS_REG(2, 0, 0, 12, 5), - ARM64_SYS_REG(2, 0, 0, 12, 6), - ARM64_SYS_REG(2, 0, 0, 12, 7), - ARM64_SYS_REG(2, 0, 0, 13, 4), - ARM64_SYS_REG(2, 0, 0, 13, 5), - ARM64_SYS_REG(2, 0, 0, 13, 6), - ARM64_SYS_REG(2, 0, 0, 13, 7), - ARM64_SYS_REG(2, 0, 0, 14, 4), - ARM64_SYS_REG(2, 0, 0, 14, 5), - ARM64_SYS_REG(2, 0, 0, 14, 6), - ARM64_SYS_REG(2, 0, 0, 14, 7), - ARM64_SYS_REG(2, 0, 0, 15, 4), - ARM64_SYS_REG(2, 0, 0, 15, 5), - ARM64_SYS_REG(2, 0, 0, 15, 6), - ARM64_SYS_REG(2, 0, 0, 15, 7), - ARM64_SYS_REG(2, 0, 1, 1, 4), /* OSLSR_EL1 */ - ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ - ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ - ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 3), - ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ - ARM64_SYS_REG(3, 0, 0, 3, 7), - ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 4, 2), /* ID_AA64PFR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 4, 3), - ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 4, 5), /* ID_AA64SMFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 4, 6), - ARM64_SYS_REG(3, 0, 0, 4, 7), - ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 5, 2), - ARM64_SYS_REG(3, 0, 0, 5, 3), - ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 5, 6), - ARM64_SYS_REG(3, 0, 0, 5, 7), - ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ - 
ARM64_SYS_REG(3, 0, 0, 6, 2), /* ID_AA64ISAR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 6, 3), - ARM64_SYS_REG(3, 0, 0, 6, 4), - ARM64_SYS_REG(3, 0, 0, 6, 5), - ARM64_SYS_REG(3, 0, 0, 6, 6), - ARM64_SYS_REG(3, 0, 0, 6, 7), - ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 4), /* ID_AA64MMFR4_EL1 */ - ARM64_SYS_REG(3, 0, 0, 7, 5), - ARM64_SYS_REG(3, 0, 0, 7, 6), - ARM64_SYS_REG(3, 0, 0, 7, 7), - ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ - ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ - ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ - ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ - ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ - ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ - ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ - ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ - ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ - ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ - ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ - ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ - ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ - ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ - ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ - ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ - ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ - ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ - ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ - ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ - ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ - ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ - ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ - ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ - ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ - ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ - ARM64_SYS_REG(3, 3, 14, 2, 1), /* CNTP_CTL_EL0 */ - ARM64_SYS_REG(3, 3, 14, 2, 2), /* CNTP_CVAL_EL0 */ - ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ - ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ -}; - -static __u64 pmu_regs[] = { - ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ - ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ - ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ - ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ - ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ - ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ - ARM64_SYS_REG(3, 3, 14, 8, 0), - ARM64_SYS_REG(3, 3, 14, 8, 1), - ARM64_SYS_REG(3, 3, 14, 8, 2), - ARM64_SYS_REG(3, 3, 14, 8, 3), - ARM64_SYS_REG(3, 3, 14, 8, 4), - ARM64_SYS_REG(3, 3, 14, 8, 5), - ARM64_SYS_REG(3, 3, 14, 8, 6), - ARM64_SYS_REG(3, 3, 14, 8, 7), - ARM64_SYS_REG(3, 3, 14, 9, 0), - ARM64_SYS_REG(3, 3, 14, 9, 1), - ARM64_SYS_REG(3, 3, 14, 9, 2), - ARM64_SYS_REG(3, 3, 14, 9, 3), - ARM64_SYS_REG(3, 3, 14, 9, 4), - ARM64_SYS_REG(3, 3, 14, 9, 5), - ARM64_SYS_REG(3, 3, 14, 9, 6), - ARM64_SYS_REG(3, 3, 14, 9, 7), - ARM64_SYS_REG(3, 3, 14, 10, 0), - ARM64_SYS_REG(3, 3, 14, 10, 1), - ARM64_SYS_REG(3, 3, 14, 10, 2), - ARM64_SYS_REG(3, 3, 14, 10, 3), - ARM64_SYS_REG(3, 3, 14, 10, 4), - ARM64_SYS_REG(3, 3, 14, 10, 5), - ARM64_SYS_REG(3, 3, 14, 10, 6), - ARM64_SYS_REG(3, 3, 14, 10, 7), - 
ARM64_SYS_REG(3, 3, 14, 11, 0), - ARM64_SYS_REG(3, 3, 14, 11, 1), - ARM64_SYS_REG(3, 3, 14, 11, 2), - ARM64_SYS_REG(3, 3, 14, 11, 3), - ARM64_SYS_REG(3, 3, 14, 11, 4), - ARM64_SYS_REG(3, 3, 14, 11, 5), - ARM64_SYS_REG(3, 3, 14, 11, 6), - ARM64_SYS_REG(3, 3, 14, 12, 0), - ARM64_SYS_REG(3, 3, 14, 12, 1), - ARM64_SYS_REG(3, 3, 14, 12, 2), - ARM64_SYS_REG(3, 3, 14, 12, 3), - ARM64_SYS_REG(3, 3, 14, 12, 4), - ARM64_SYS_REG(3, 3, 14, 12, 5), - ARM64_SYS_REG(3, 3, 14, 12, 6), - ARM64_SYS_REG(3, 3, 14, 12, 7), - ARM64_SYS_REG(3, 3, 14, 13, 0), - ARM64_SYS_REG(3, 3, 14, 13, 1), - ARM64_SYS_REG(3, 3, 14, 13, 2), - ARM64_SYS_REG(3, 3, 14, 13, 3), - ARM64_SYS_REG(3, 3, 14, 13, 4), - ARM64_SYS_REG(3, 3, 14, 13, 5), - ARM64_SYS_REG(3, 3, 14, 13, 6), - ARM64_SYS_REG(3, 3, 14, 13, 7), - ARM64_SYS_REG(3, 3, 14, 14, 0), - ARM64_SYS_REG(3, 3, 14, 14, 1), - ARM64_SYS_REG(3, 3, 14, 14, 2), - ARM64_SYS_REG(3, 3, 14, 14, 3), - ARM64_SYS_REG(3, 3, 14, 14, 4), - ARM64_SYS_REG(3, 3, 14, 14, 5), - ARM64_SYS_REG(3, 3, 14, 14, 6), - ARM64_SYS_REG(3, 3, 14, 14, 7), - ARM64_SYS_REG(3, 3, 14, 15, 0), - ARM64_SYS_REG(3, 3, 14, 15, 1), - ARM64_SYS_REG(3, 3, 14, 15, 2), - ARM64_SYS_REG(3, 3, 14, 15, 3), - ARM64_SYS_REG(3, 3, 14, 15, 4), - ARM64_SYS_REG(3, 3, 14, 15, 5), - ARM64_SYS_REG(3, 3, 14, 15, 6), - ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ -}; - -static __u64 vregs[] = { - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), - KVM_REG_ARM64 | 
KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), - KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), -}; - -static __u64 sve_regs[] = { - KVM_REG_ARM64_SVE_VLS, - KVM_REG_ARM64_SVE_ZREG(0, 0), - KVM_REG_ARM64_SVE_ZREG(1, 0), - KVM_REG_ARM64_SVE_ZREG(2, 0), - KVM_REG_ARM64_SVE_ZREG(3, 0), - KVM_REG_ARM64_SVE_ZREG(4, 0), - KVM_REG_ARM64_SVE_ZREG(5, 0), - KVM_REG_ARM64_SVE_ZREG(6, 0), - KVM_REG_ARM64_SVE_ZREG(7, 0), - KVM_REG_ARM64_SVE_ZREG(8, 0), - KVM_REG_ARM64_SVE_ZREG(9, 0), - KVM_REG_ARM64_SVE_ZREG(10, 0), - KVM_REG_ARM64_SVE_ZREG(11, 0), - KVM_REG_ARM64_SVE_ZREG(12, 0), - KVM_REG_ARM64_SVE_ZREG(13, 0), - KVM_REG_ARM64_SVE_ZREG(14, 0), - KVM_REG_ARM64_SVE_ZREG(15, 0), - KVM_REG_ARM64_SVE_ZREG(16, 0), - KVM_REG_ARM64_SVE_ZREG(17, 0), - KVM_REG_ARM64_SVE_ZREG(18, 0), - KVM_REG_ARM64_SVE_ZREG(19, 0), - KVM_REG_ARM64_SVE_ZREG(20, 0), - KVM_REG_ARM64_SVE_ZREG(21, 0), - KVM_REG_ARM64_SVE_ZREG(22, 0), - KVM_REG_ARM64_SVE_ZREG(23, 0), - KVM_REG_ARM64_SVE_ZREG(24, 0), - KVM_REG_ARM64_SVE_ZREG(25, 0), - KVM_REG_ARM64_SVE_ZREG(26, 0), - KVM_REG_ARM64_SVE_ZREG(27, 0), - KVM_REG_ARM64_SVE_ZREG(28, 0), - KVM_REG_ARM64_SVE_ZREG(29, 0), - KVM_REG_ARM64_SVE_ZREG(30, 0), - KVM_REG_ARM64_SVE_ZREG(31, 0), - KVM_REG_ARM64_SVE_PREG(0, 0), - KVM_REG_ARM64_SVE_PREG(1, 0), - KVM_REG_ARM64_SVE_PREG(2, 0), - KVM_REG_ARM64_SVE_PREG(3, 0), - KVM_REG_ARM64_SVE_PREG(4, 0), - KVM_REG_ARM64_SVE_PREG(5, 0), - KVM_REG_ARM64_SVE_PREG(6, 0), - KVM_REG_ARM64_SVE_PREG(7, 0), - KVM_REG_ARM64_SVE_PREG(8, 0), - KVM_REG_ARM64_SVE_PREG(9, 0), - KVM_REG_ARM64_SVE_PREG(10, 0), - KVM_REG_ARM64_SVE_PREG(11, 0), - KVM_REG_ARM64_SVE_PREG(12, 0), - KVM_REG_ARM64_SVE_PREG(13, 0), - KVM_REG_ARM64_SVE_PREG(14, 0), - KVM_REG_ARM64_SVE_PREG(15, 0), - KVM_REG_ARM64_SVE_FFR(0), - ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ -}; - -static __u64 sve_rejects_set[] = { - KVM_REG_ARM64_SVE_VLS, -}; - -static __u64 pauth_addr_regs[] = { - ARM64_SYS_REG(3, 0, 2, 1, 0), /* APIAKEYLO_EL1 */ - ARM64_SYS_REG(3, 0, 2, 1, 1), /* APIAKEYHI_EL1 */ - ARM64_SYS_REG(3, 0, 2, 1, 2), /* APIBKEYLO_EL1 */ - ARM64_SYS_REG(3, 0, 2, 1, 3), /* APIBKEYHI_EL1 */ - ARM64_SYS_REG(3, 0, 2, 2, 0), /* APDAKEYLO_EL1 */ - ARM64_SYS_REG(3, 0, 2, 2, 1), /* APDAKEYHI_EL1 */ - ARM64_SYS_REG(3, 0, 2, 2, 2), /* APDBKEYLO_EL1 */ - ARM64_SYS_REG(3, 0, 2, 2, 3) /* APDBKEYHI_EL1 */ -}; - -static __u64 pauth_generic_regs[] = { - ARM64_SYS_REG(3, 0, 2, 3, 0), /* APGAKEYLO_EL1 */ - ARM64_SYS_REG(3, 0, 2, 3, 1), /* APGAKEYHI_EL1 */ -}; - -#define BASE_SUBLIST \ - { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } -#define VREGS_SUBLIST \ - { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), } -#define PMU_SUBLIST \ - { "pmu", 
.capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \ - .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), } -#define SVE_SUBLIST \ - { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \ - .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \ - .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), } -#define PAUTH_SUBLIST \ - { \ - .name = "pauth_address", \ - .capability = KVM_CAP_ARM_PTRAUTH_ADDRESS, \ - .feature = KVM_ARM_VCPU_PTRAUTH_ADDRESS, \ - .regs = pauth_addr_regs, \ - .regs_n = ARRAY_SIZE(pauth_addr_regs), \ - }, \ - { \ - .name = "pauth_generic", \ - .capability = KVM_CAP_ARM_PTRAUTH_GENERIC, \ - .feature = KVM_ARM_VCPU_PTRAUTH_GENERIC, \ - .regs = pauth_generic_regs, \ - .regs_n = ARRAY_SIZE(pauth_generic_regs), \ - } - -static struct vcpu_reg_list vregs_config = { - .sublists = { - BASE_SUBLIST, - VREGS_SUBLIST, - {0}, - }, -}; -static struct vcpu_reg_list vregs_pmu_config = { - .sublists = { - BASE_SUBLIST, - VREGS_SUBLIST, - PMU_SUBLIST, - {0}, - }, -}; -static struct vcpu_reg_list sve_config = { - .sublists = { - BASE_SUBLIST, - SVE_SUBLIST, - {0}, - }, -}; -static struct vcpu_reg_list sve_pmu_config = { - .sublists = { - BASE_SUBLIST, - SVE_SUBLIST, - PMU_SUBLIST, - {0}, - }, -}; -static struct vcpu_reg_list pauth_config = { - .sublists = { - BASE_SUBLIST, - VREGS_SUBLIST, - PAUTH_SUBLIST, - {0}, - }, -}; -static struct vcpu_reg_list pauth_pmu_config = { - .sublists = { - BASE_SUBLIST, - VREGS_SUBLIST, - PAUTH_SUBLIST, - PMU_SUBLIST, - {0}, - }, -}; - -struct vcpu_reg_list *vcpu_configs[] = { - &vregs_config, - &vregs_pmu_config, - &sve_config, - &sve_pmu_config, - &pauth_config, - &pauth_pmu_config, -}; -int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c deleted file mode 100644 index ec54ec7726e9..000000000000 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ /dev/null @@ -1,308 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -/* hypercalls: Check the ARM64's psuedo-firmware bitmap register interface. - * - * The test validates the basic hypercall functionalities that are exposed - * via the psuedo-firmware bitmap register. This includes the registers' - * read/write behavior before and after the VM has started, and if the - * hypercalls are properly masked or unmasked to the guest when disabled or - * enabled from the KVM userspace, respectively. 
- */ -#include -#include -#include -#include - -#include "processor.h" - -#define FW_REG_ULIMIT_VAL(max_feat_bit) (GENMASK(max_feat_bit, 0)) - -/* Last valid bits of the bitmapped firmware registers */ -#define KVM_REG_ARM_STD_BMAP_BIT_MAX 0 -#define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX 0 -#define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX 1 - -struct kvm_fw_reg_info { - uint64_t reg; /* Register definition */ - uint64_t max_feat_bit; /* Bit that represents the upper limit of the feature-map */ -}; - -#define FW_REG_INFO(r) \ - { \ - .reg = r, \ - .max_feat_bit = r##_BIT_MAX, \ - } - -static const struct kvm_fw_reg_info fw_reg_info[] = { - FW_REG_INFO(KVM_REG_ARM_STD_BMAP), - FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP), - FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP), -}; - -enum test_stage { - TEST_STAGE_REG_IFACE, - TEST_STAGE_HVC_IFACE_FEAT_DISABLED, - TEST_STAGE_HVC_IFACE_FEAT_ENABLED, - TEST_STAGE_HVC_IFACE_FALSE_INFO, - TEST_STAGE_END, -}; - -static int stage = TEST_STAGE_REG_IFACE; - -struct test_hvc_info { - uint32_t func_id; - uint64_t arg1; -}; - -#define TEST_HVC_INFO(f, a1) \ - { \ - .func_id = f, \ - .arg1 = a1, \ - } - -static const struct test_hvc_info hvc_info[] = { - /* KVM_REG_ARM_STD_BMAP */ - TEST_HVC_INFO(ARM_SMCCC_TRNG_VERSION, 0), - TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_TRNG_RND64), - TEST_HVC_INFO(ARM_SMCCC_TRNG_GET_UUID, 0), - TEST_HVC_INFO(ARM_SMCCC_TRNG_RND32, 0), - TEST_HVC_INFO(ARM_SMCCC_TRNG_RND64, 0), - - /* KVM_REG_ARM_STD_HYP_BMAP */ - TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_HV_PV_TIME_FEATURES), - TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_HV_PV_TIME_ST), - TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_ST, 0), - - /* KVM_REG_ARM_VENDOR_HYP_BMAP */ - TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, - ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), - TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0), - TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, KVM_PTP_VIRT_COUNTER), -}; - -/* Feed false hypercall info to test the KVM behavior */ -static const struct test_hvc_info false_hvc_info[] = { - /* Feature support check against a different family of hypercalls */ - TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), - TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_TRNG_RND64), - TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_TRNG_RND64), -}; - -static void guest_test_hvc(const struct test_hvc_info *hc_info) -{ - unsigned int i; - struct arm_smccc_res res; - unsigned int hvc_info_arr_sz; - - hvc_info_arr_sz = - hc_info == hvc_info ? 
ARRAY_SIZE(hvc_info) : ARRAY_SIZE(false_hvc_info); - - for (i = 0; i < hvc_info_arr_sz; i++, hc_info++) { - memset(&res, 0, sizeof(res)); - smccc_hvc(hc_info->func_id, hc_info->arg1, 0, 0, 0, 0, 0, 0, &res); - - switch (stage) { - case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: - case TEST_STAGE_HVC_IFACE_FALSE_INFO: - __GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED, - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", - res.a0, hc_info->func_id, hc_info->arg1, stage); - break; - case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: - __GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED, - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", - res.a0, hc_info->func_id, hc_info->arg1, stage); - break; - default: - GUEST_FAIL("Unexpected stage = %u", stage); - } - } -} - -static void guest_code(void) -{ - while (stage != TEST_STAGE_END) { - switch (stage) { - case TEST_STAGE_REG_IFACE: - break; - case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: - case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: - guest_test_hvc(hvc_info); - break; - case TEST_STAGE_HVC_IFACE_FALSE_INFO: - guest_test_hvc(false_hvc_info); - break; - default: - GUEST_FAIL("Unexpected stage = %u", stage); - } - - GUEST_SYNC(stage); - } - - GUEST_DONE(); -} - -struct st_time { - uint32_t rev; - uint32_t attr; - uint64_t st_time; -}; - -#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) -#define ST_GPA_BASE (1 << 30) - -static void steal_time_init(struct kvm_vcpu *vcpu) -{ - uint64_t st_ipa = (ulong)ST_GPA_BASE; - unsigned int gpages; - - gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE); - vm_userspace_mem_region_add(vcpu->vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - - vcpu_device_attr_set(vcpu, KVM_ARM_VCPU_PVTIME_CTRL, - KVM_ARM_VCPU_PVTIME_IPA, &st_ipa); -} - -static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) -{ - uint64_t val; - unsigned int i; - int ret; - - for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { - const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; - - /* First 'read' should be an upper limit of the features supported */ - val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), - "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", - reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); - - /* Test a 'write' by disabling all the features of the register map */ - ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); - TEST_ASSERT(ret == 0, - "Failed to clear all the features of reg: 0x%lx; ret: %d", - reg_info->reg, errno); - - val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); - - /* - * Test enabling a feature that's not supported. - * Avoid this check if all the bits are occupied. - */ - if (reg_info->max_feat_bit < 63) { - ret = __vcpu_set_reg(vcpu, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); - TEST_ASSERT(ret != 0 && errno == EINVAL, - "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx", - errno, reg_info->reg); - } - } -} - -static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) -{ - uint64_t val; - unsigned int i; - int ret; - - for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { - const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; - - /* - * Before starting the VM, the test clears all the bits. - * Check if that's still the case. 
- */ - val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx", - reg_info->reg); - - /* - * Since the VM has run at least once, KVM shouldn't allow modification of - * the registers and should return EBUSY. Set the registers and check for - * the expected errno. - */ - ret = __vcpu_set_reg(vcpu, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); - TEST_ASSERT(ret != 0 && errno == EBUSY, - "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx", - errno, reg_info->reg); - } -} - -static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu) -{ - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(vcpu, guest_code); - - steal_time_init(*vcpu); - - return vm; -} - -static void test_guest_stage(struct kvm_vm **vm, struct kvm_vcpu **vcpu) -{ - int prev_stage = stage; - - pr_debug("Stage: %d\n", prev_stage); - - /* Sync the stage early, the VM might be freed below. */ - stage++; - sync_global_to_guest(*vm, stage); - - switch (prev_stage) { - case TEST_STAGE_REG_IFACE: - test_fw_regs_after_vm_start(*vcpu); - break; - case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: - /* Start a new VM so that all the features are now enabled by default */ - kvm_vm_free(*vm); - *vm = test_vm_create(vcpu); - break; - case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: - case TEST_STAGE_HVC_IFACE_FALSE_INFO: - break; - default: - TEST_FAIL("Unknown test stage: %d", prev_stage); - } -} - -static void test_run(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - bool guest_done = false; - - vm = test_vm_create(&vcpu); - - test_fw_regs_before_vm_start(vcpu); - - while (!guest_done) { - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - test_guest_stage(&vm, &vcpu); - break; - case UCALL_DONE: - guest_done = true; - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - default: - TEST_FAIL("Unexpected guest exit"); - } - } - - kvm_vm_free(vm); -} - -int main(void) -{ - test_run(); - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/mmio_abort.c b/tools/testing/selftests/kvm/aarch64/mmio_abort.c deleted file mode 100644 index 8b7a80a51b1c..000000000000 --- a/tools/testing/selftests/kvm/aarch64/mmio_abort.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mmio_abort - Tests for userspace MMIO abort injection - * - * Copyright (c) 2024 Google LLC - */ -#include "processor.h" -#include "test_util.h" - -#define MMIO_ADDR 0x8000000ULL - -static u64 expected_abort_pc; - -static void expect_sea_handler(struct ex_regs *regs) -{ - u64 esr = read_sysreg(esr_el1); - - GUEST_ASSERT_EQ(regs->pc, expected_abort_pc); - GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); - GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); - - GUEST_DONE(); -} - -static void unexpected_dabt_handler(struct ex_regs *regs) -{ - GUEST_FAIL("Unexpected data abort at PC: %lx\n", regs->pc); -} - -static struct kvm_vm *vm_create_with_dabt_handler(struct kvm_vcpu **vcpu, void *guest_code, - handler_fn dabt_handler) -{ - struct kvm_vm *vm = vm_create_with_one_vcpu(vcpu, guest_code); - - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(*vcpu); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_ELx_EC_DABT_CUR, dabt_handler); - - virt_map(vm, MMIO_ADDR, MMIO_ADDR, 1); - - return vm; -} - -static void vcpu_inject_extabt(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_events events = {}; - - events.exception.ext_dabt_pending = true; - 
vcpu_events_set(vcpu, &events); -} - -static void vcpu_run_expect_done(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } -} - -extern char test_mmio_abort_insn; - -static void test_mmio_abort_guest(void) -{ - WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_abort_insn); - - asm volatile("test_mmio_abort_insn:\n\t" - "ldr x0, [%0]\n\t" - : : "r" (MMIO_ADDR) : "x0", "memory"); - - GUEST_FAIL("MMIO instruction should not retire"); -} - -/* - * Test that KVM doesn't complete MMIO emulation when userspace has made an - * external abort pending for the instruction. - */ -static void test_mmio_abort(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_abort_guest, - expect_sea_handler); - struct kvm_run *run = vcpu->run; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO); - TEST_ASSERT_EQ(run->mmio.phys_addr, MMIO_ADDR); - TEST_ASSERT_EQ(run->mmio.len, sizeof(unsigned long)); - TEST_ASSERT(!run->mmio.is_write, "Expected MMIO read"); - - vcpu_inject_extabt(vcpu); - vcpu_run_expect_done(vcpu); - kvm_vm_free(vm); -} - -extern char test_mmio_nisv_insn; - -static void test_mmio_nisv_guest(void) -{ - WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_nisv_insn); - - asm volatile("test_mmio_nisv_insn:\n\t" - "ldr x0, [%0], #8\n\t" - : : "r" (MMIO_ADDR) : "x0", "memory"); - - GUEST_FAIL("MMIO instruction should not retire"); -} - -/* - * Test that the KVM_RUN ioctl fails for ESR_EL2.ISV=0 MMIO aborts if userspace - * hasn't enabled KVM_CAP_ARM_NISV_TO_USER. - */ -static void test_mmio_nisv(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, - unexpected_dabt_handler); - - TEST_ASSERT(_vcpu_run(vcpu), "Expected nonzero return code from KVM_RUN"); - TEST_ASSERT_EQ(errno, ENOSYS); - - kvm_vm_free(vm); -} - -/* - * Test that ESR_EL2.ISV=0 MMIO aborts reach userspace and that an injected SEA - * reaches the guest. - */ -static void test_mmio_nisv_abort(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, - expect_sea_handler); - struct kvm_run *run = vcpu->run; - - vm_enable_cap(vm, KVM_CAP_ARM_NISV_TO_USER, 1); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_NISV); - TEST_ASSERT_EQ(run->arm_nisv.fault_ipa, MMIO_ADDR); - - vcpu_inject_extabt(vcpu); - vcpu_run_expect_done(vcpu); - kvm_vm_free(vm); -} - -int main(void) -{ - test_mmio_abort(); - test_mmio_nisv(); - test_mmio_nisv_abort(); -} diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c deleted file mode 100644 index ebd70430c89d..000000000000 --- a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -// Check that, on a GICv3 system, not configuring GICv3 correctly -// results in all of the sysregs generating an UNDEF exception. 
- -#include -#include -#include - -static volatile bool handled; - -#define __check_sr_read(r) \ - ({ \ - uint64_t val; \ - \ - handled = false; \ - dsb(sy); \ - val = read_sysreg_s(SYS_ ## r); \ - val; \ - }) - -#define __check_sr_write(r) \ - do { \ - handled = false; \ - dsb(sy); \ - write_sysreg_s(0, SYS_ ## r); \ - isb(); \ - } while(0) - -/* Fatal checks */ -#define check_sr_read(r) \ - do { \ - __check_sr_read(r); \ - __GUEST_ASSERT(handled, #r " no read trap"); \ - } while(0) - -#define check_sr_write(r) \ - do { \ - __check_sr_write(r); \ - __GUEST_ASSERT(handled, #r " no write trap"); \ - } while(0) - -#define check_sr_rw(r) \ - do { \ - check_sr_read(r); \ - check_sr_write(r); \ - } while(0) - -static void guest_code(void) -{ - uint64_t val; - - /* - * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having - * hidden the feature at runtime without any other userspace action. - */ - __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), - read_sysreg(id_aa64pfr0_el1)) == 0, - "GICv3 wrongly advertised"); - - /* - * Access all GICv3 registers, and fail if we don't get an UNDEF. - * Note that we happily access all the APxRn registers without - * checking their existance, as all we want to see is a failure. - */ - check_sr_rw(ICC_PMR_EL1); - check_sr_read(ICC_IAR0_EL1); - check_sr_write(ICC_EOIR0_EL1); - check_sr_rw(ICC_HPPIR0_EL1); - check_sr_rw(ICC_BPR0_EL1); - check_sr_rw(ICC_AP0R0_EL1); - check_sr_rw(ICC_AP0R1_EL1); - check_sr_rw(ICC_AP0R2_EL1); - check_sr_rw(ICC_AP0R3_EL1); - check_sr_rw(ICC_AP1R0_EL1); - check_sr_rw(ICC_AP1R1_EL1); - check_sr_rw(ICC_AP1R2_EL1); - check_sr_rw(ICC_AP1R3_EL1); - check_sr_write(ICC_DIR_EL1); - check_sr_read(ICC_RPR_EL1); - check_sr_write(ICC_SGI1R_EL1); - check_sr_write(ICC_ASGI1R_EL1); - check_sr_write(ICC_SGI0R_EL1); - check_sr_read(ICC_IAR1_EL1); - check_sr_write(ICC_EOIR1_EL1); - check_sr_rw(ICC_HPPIR1_EL1); - check_sr_rw(ICC_BPR1_EL1); - check_sr_rw(ICC_CTLR_EL1); - check_sr_rw(ICC_IGRPEN0_EL1); - check_sr_rw(ICC_IGRPEN1_EL1); - - /* - * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can - * be RAO/WI. Engage in non-fatal accesses, starting with a - * write of 0 to try and disable SRE, and let's see if it - * sticks. - */ - __check_sr_write(ICC_SRE_EL1); - if (!handled) - GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n"); - - val = __check_sr_read(ICC_SRE_EL1); - if (!handled) { - __GUEST_ASSERT((val & BIT(0)), - "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n"); - GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n"); - } - - GUEST_DONE(); -} - -static void guest_undef_handler(struct ex_regs *regs) -{ - /* Success, we've gracefully exploded! 
*/ - handled = true; - regs->pc += 4; -} - -static void test_run_vcpu(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - do { - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_PRINTF: - printf("%s", uc.buffer); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } while (uc.cmd != UCALL_DONE); -} - -static void test_guest_no_gicv3(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - /* Create a VM without a GICv3 */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_UNKNOWN, guest_undef_handler); - - test_run_vcpu(vcpu); - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t pfr0; - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), - "GICv3 not supported."); - kvm_vm_free(vm); - - test_guest_no_gicv3(); - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c deleted file mode 100644 index ec33a8f9c908..000000000000 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ /dev/null @@ -1,1135 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * page_fault_test.c - Test stage 2 faults. - * - * This test tries different combinations of guest accesses (e.g., write, - * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on - * hugetlbfs with a hole). It checks that the expected handling method is - * called (e.g., uffd faults with the right address and write/read flag). - */ -#include -#include -#include -#include -#include -#include -#include -#include "guest_modes.h" -#include "userfaultfd_util.h" - -/* Guest virtual addresses that point to the test page and its PTE. */ -#define TEST_GVA 0xc0000000 -#define TEST_EXEC_GVA (TEST_GVA + 0x8) -#define TEST_PTE_GVA 0xb0000000 -#define TEST_DATA 0x0123456789ABCDEF - -static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; - -#define CMD_NONE (0) -#define CMD_SKIP_TEST (1ULL << 1) -#define CMD_HOLE_PT (1ULL << 2) -#define CMD_HOLE_DATA (1ULL << 3) -#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4) -#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5) -#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6) -#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7) -#define CMD_SET_PTE_AF (1ULL << 8) - -#define PREPARE_FN_NR 10 -#define CHECK_FN_NR 10 - -static struct event_cnt { - int mmio_exits; - int fail_vcpu_runs; - int uffd_faults; - /* uffd_faults is incremented from multiple threads. 
*/ - pthread_mutex_t uffd_faults_mutex; -} events; - -struct test_desc { - const char *name; - uint64_t mem_mark_cmd; - /* Skip the test if any prepare function returns false */ - bool (*guest_prepare[PREPARE_FN_NR])(void); - void (*guest_test)(void); - void (*guest_test_check[CHECK_FN_NR])(void); - uffd_handler_t uffd_pt_handler; - uffd_handler_t uffd_data_handler; - void (*dabt_handler)(struct ex_regs *regs); - void (*iabt_handler)(struct ex_regs *regs); - void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run); - void (*fail_vcpu_run_handler)(int ret); - uint32_t pt_memslot_flags; - uint32_t data_memslot_flags; - bool skip; - struct event_cnt expected_events; -}; - -struct test_params { - enum vm_mem_backing_src_type src_type; - struct test_desc *test_desc; -}; - -static inline void flush_tlb_page(uint64_t vaddr) -{ - uint64_t page = vaddr >> 12; - - dsb(ishst); - asm volatile("tlbi vaae1is, %0" :: "r" (page)); - dsb(ish); - isb(); -} - -static void guest_write64(void) -{ - uint64_t val; - - WRITE_ONCE(*guest_test_memory, TEST_DATA); - val = READ_ONCE(*guest_test_memory); - GUEST_ASSERT_EQ(val, TEST_DATA); -} - -/* Check the system for atomic instructions. */ -static bool guest_check_lse(void) -{ - uint64_t isar0 = read_sysreg(id_aa64isar0_el1); - uint64_t atomic; - - atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0); - return atomic >= 2; -} - -static bool guest_check_dc_zva(void) -{ - uint64_t dczid = read_sysreg(dczid_el0); - uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid); - - return dzp == 0; -} - -/* Compare and swap instruction. */ -static void guest_cas(void) -{ - uint64_t val; - - GUEST_ASSERT(guest_check_lse()); - asm volatile(".arch_extension lse\n" - "casal %0, %1, [%2]\n" - :: "r" (0ul), "r" (TEST_DATA), "r" (guest_test_memory)); - val = READ_ONCE(*guest_test_memory); - GUEST_ASSERT_EQ(val, TEST_DATA); -} - -static void guest_read64(void) -{ - uint64_t val; - - val = READ_ONCE(*guest_test_memory); - GUEST_ASSERT_EQ(val, 0); -} - -/* Address translation instruction */ -static void guest_at(void) -{ - uint64_t par; - - asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); - isb(); - par = read_sysreg(par_el1); - - /* Bit 1 indicates whether the AT was successful */ - GUEST_ASSERT_EQ(par & 1, 0); -} - -/* - * The size of the block written by "dc zva" is guaranteed to be between (2 << - * 0) and (2 << 9), which is safe in our case as we need the write to happen - * for at least a word, and not more than a page. - */ -static void guest_dc_zva(void) -{ - uint16_t val; - - asm volatile("dc zva, %0" :: "r" (guest_test_memory)); - dsb(ish); - val = READ_ONCE(*guest_test_memory); - GUEST_ASSERT_EQ(val, 0); -} - -/* - * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0). - * And that's special because KVM must take special care with those: they - * should still count as accesses for dirty logging or user-faulting, but - * should be handled differently on mmio. - */ -static void guest_ld_preidx(void) -{ - uint64_t val; - uint64_t addr = TEST_GVA - 8; - - /* - * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is - * in a gap between memslots not backing by anything. - */ - asm volatile("ldr %0, [%1, #8]!" - : "=r" (val), "+r" (addr)); - GUEST_ASSERT_EQ(val, 0); - GUEST_ASSERT_EQ(addr, TEST_GVA); -} - -static void guest_st_preidx(void) -{ - uint64_t val = TEST_DATA; - uint64_t addr = TEST_GVA - 8; - - asm volatile("str %0, [%1, #8]!" 
- : "+r" (val), "+r" (addr)); - - GUEST_ASSERT_EQ(addr, TEST_GVA); - val = READ_ONCE(*guest_test_memory); -} - -static bool guest_set_ha(void) -{ - uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1); - uint64_t hadbs, tcr; - - /* Skip if HA is not supported. */ - hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1); - if (hadbs == 0) - return false; - - tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; - write_sysreg(tcr, tcr_el1); - isb(); - - return true; -} - -static bool guest_clear_pte_af(void) -{ - *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF; - flush_tlb_page(TEST_GVA); - - return true; -} - -static void guest_check_pte_af(void) -{ - dsb(ish); - GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); -} - -static void guest_check_write_in_dirty_log(void) -{ - GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG); -} - -static void guest_check_no_write_in_dirty_log(void) -{ - GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG); -} - -static void guest_check_s1ptw_wr_in_dirty_log(void) -{ - GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); -} - -static void guest_check_no_s1ptw_wr_in_dirty_log(void) -{ - GUEST_SYNC(CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG); -} - -static void guest_exec(void) -{ - int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; - int ret; - - ret = code(); - GUEST_ASSERT_EQ(ret, 0x77); -} - -static bool guest_prepare(struct test_desc *test) -{ - bool (*prepare_fn)(void); - int i; - - for (i = 0; i < PREPARE_FN_NR; i++) { - prepare_fn = test->guest_prepare[i]; - if (prepare_fn && !prepare_fn()) - return false; - } - - return true; -} - -static void guest_test_check(struct test_desc *test) -{ - void (*check_fn)(void); - int i; - - for (i = 0; i < CHECK_FN_NR; i++) { - check_fn = test->guest_test_check[i]; - if (check_fn) - check_fn(); - } -} - -static void guest_code(struct test_desc *test) -{ - if (!guest_prepare(test)) - GUEST_SYNC(CMD_SKIP_TEST); - - GUEST_SYNC(test->mem_mark_cmd); - - if (test->guest_test) - test->guest_test(); - - guest_test_check(test); - GUEST_DONE(); -} - -static void no_dabt_handler(struct ex_regs *regs) -{ - GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1)); -} - -static void no_iabt_handler(struct ex_regs *regs) -{ - GUEST_FAIL("Unexpected iabt, pc = 0x%lx", regs->pc); -} - -static struct uffd_args { - char *copy; - void *hva; - uint64_t paging_size; -} pt_args, data_args; - -/* Returns true to continue the test, and false if it should be skipped. 
*/ -static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, - struct uffd_args *args) -{ - uint64_t addr = msg->arg.pagefault.address; - uint64_t flags = msg->arg.pagefault.flags; - struct uffdio_copy copy; - int ret; - - TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, - "The only expected UFFD mode is MISSING"); - TEST_ASSERT_EQ(addr, (uint64_t)args->hva); - - pr_debug("uffd fault: addr=%p write=%d\n", - (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE)); - - copy.src = (uint64_t)args->copy; - copy.dst = addr; - copy.len = args->paging_size; - copy.mode = 0; - - ret = ioctl(uffd, UFFDIO_COPY, ©); - if (ret == -1) { - pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n", - addr, errno); - return ret; - } - - pthread_mutex_lock(&events.uffd_faults_mutex); - events.uffd_faults += 1; - pthread_mutex_unlock(&events.uffd_faults_mutex); - return 0; -} - -static int uffd_pt_handler(int mode, int uffd, struct uffd_msg *msg) -{ - return uffd_generic_handler(mode, uffd, msg, &pt_args); -} - -static int uffd_data_handler(int mode, int uffd, struct uffd_msg *msg) -{ - return uffd_generic_handler(mode, uffd, msg, &data_args); -} - -static void setup_uffd_args(struct userspace_mem_region *region, - struct uffd_args *args) -{ - args->hva = (void *)region->region.userspace_addr; - args->paging_size = region->region.memory_size; - - args->copy = malloc(args->paging_size); - TEST_ASSERT(args->copy, "Failed to allocate data copy."); - memcpy(args->copy, args->hva, args->paging_size); -} - -static void setup_uffd(struct kvm_vm *vm, struct test_params *p, - struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd) -{ - struct test_desc *test = p->test_desc; - int uffd_mode = UFFDIO_REGISTER_MODE_MISSING; - - setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args); - setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args); - - *pt_uffd = NULL; - if (test->uffd_pt_handler) - *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, - pt_args.hva, - pt_args.paging_size, - 1, test->uffd_pt_handler); - - *data_uffd = NULL; - if (test->uffd_data_handler) - *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, - data_args.hva, - data_args.paging_size, - 1, test->uffd_data_handler); -} - -static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, - struct uffd_desc *data_uffd) -{ - if (test->uffd_pt_handler) - uffd_stop_demand_paging(pt_uffd); - if (test->uffd_data_handler) - uffd_stop_demand_paging(data_uffd); - - free(pt_args.copy); - free(data_args.copy); -} - -static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg) -{ - TEST_FAIL("There was no UFFD fault expected."); - return -1; -} - -/* Returns false if the test should be skipped. 
*/ -static bool punch_hole_in_backing_store(struct kvm_vm *vm, - struct userspace_mem_region *region) -{ - void *hva = (void *)region->region.userspace_addr; - uint64_t paging_size = region->region.memory_size; - int ret, fd = region->fd; - - if (fd != -1) { - ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, paging_size); - TEST_ASSERT(ret == 0, "fallocate failed"); - } else { - ret = madvise(hva, paging_size, MADV_DONTNEED); - TEST_ASSERT(ret == 0, "madvise failed"); - } - - return true; -} - -static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run) -{ - struct userspace_mem_region *region; - void *hva; - - region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); - hva = (void *)region->region.userspace_addr; - - TEST_ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr); - - memcpy(hva, run->mmio.data, run->mmio.len); - events.mmio_exits += 1; -} - -static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run) -{ - uint64_t data; - - memcpy(&data, run->mmio.data, sizeof(data)); - pr_debug("addr=%lld len=%d w=%d data=%lx\n", - run->mmio.phys_addr, run->mmio.len, - run->mmio.is_write, data); - TEST_FAIL("There was no MMIO exit expected."); -} - -static bool check_write_in_dirty_log(struct kvm_vm *vm, - struct userspace_mem_region *region, - uint64_t host_pg_nr) -{ - unsigned long *bmap; - bool first_page_dirty; - uint64_t size = region->region.memory_size; - - /* getpage_size() is not always equal to vm->page_size */ - bmap = bitmap_zalloc(size / getpagesize()); - kvm_vm_get_dirty_log(vm, region->region.slot, bmap); - first_page_dirty = test_bit(host_pg_nr, bmap); - free(bmap); - return first_page_dirty; -} - -/* Returns true to continue the test, and false if it should be skipped. */ -static bool handle_cmd(struct kvm_vm *vm, int cmd) -{ - struct userspace_mem_region *data_region, *pt_region; - bool continue_test = true; - uint64_t pte_gpa, pte_pg; - - data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); - pt_region = vm_get_mem_region(vm, MEM_REGION_PT); - pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); - pte_pg = (pte_gpa - pt_region->region.guest_phys_addr) / getpagesize(); - - if (cmd == CMD_SKIP_TEST) - continue_test = false; - - if (cmd & CMD_HOLE_PT) - continue_test = punch_hole_in_backing_store(vm, pt_region); - if (cmd & CMD_HOLE_DATA) - continue_test = punch_hole_in_backing_store(vm, data_region); - if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG) - TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), - "Missing write in dirty log"); - if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, pte_pg), - "Missing s1ptw write in dirty log"); - if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) - TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), - "Unexpected write in dirty log"); - if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) - TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, pte_pg), - "Unexpected s1ptw write in dirty log"); - - return continue_test; -} - -void fail_vcpu_run_no_handler(int ret) -{ - TEST_FAIL("Unexpected vcpu run failure"); -} - -void fail_vcpu_run_mmio_no_syndrome_handler(int ret) -{ - TEST_ASSERT(errno == ENOSYS, - "The mmio handler should have returned not implemented."); - events.fail_vcpu_runs += 1; -} - -typedef uint32_t aarch64_insn_t; -extern aarch64_insn_t __exec_test[2]; - -noinline void __return_0x77(void) -{ - asm volatile("__exec_test: mov x0, #0x77\n" - "ret\n"); -} - -/* - * Note that this function runs on the host before the test VM 
starts: there's - * no need to sync the D$ and I$ caches. - */ -static void load_exec_code_for_test(struct kvm_vm *vm) -{ - uint64_t *code; - struct userspace_mem_region *region; - void *hva; - - region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); - hva = (void *)region->region.userspace_addr; - - assert(TEST_EXEC_GVA > TEST_GVA); - code = hva + TEST_EXEC_GVA - TEST_GVA; - memcpy(code, __exec_test, sizeof(__exec_test)); -} - -static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - struct test_desc *test) -{ - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_DABT_CUR, no_dabt_handler); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_ELx_EC_IABT_CUR, no_iabt_handler); -} - -static void setup_gva_maps(struct kvm_vm *vm) -{ - struct userspace_mem_region *region; - uint64_t pte_gpa; - - region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); - /* Map TEST_GVA first. This will install a new PTE. */ - virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr); - /* Then map TEST_PTE_GVA to the above PTE. */ - pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); - virt_pg_map(vm, TEST_PTE_GVA, pte_gpa); -} - -enum pf_test_memslots { - CODE_AND_DATA_MEMSLOT, - PAGE_TABLE_MEMSLOT, - TEST_DATA_MEMSLOT, -}; - -/* - * Create a memslot for code and data at pfn=0, and test-data and PT ones - * at max_gfn. - */ -static void setup_memslots(struct kvm_vm *vm, struct test_params *p) -{ - uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type); - uint64_t guest_page_size = vm->page_size; - uint64_t max_gfn = vm_compute_max_gfn(vm); - /* Enough for 2M of code when using 4K guest pages. */ - uint64_t code_npages = 512; - uint64_t pt_size, data_size, data_gpa; - - /* - * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using - * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13 - * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use - * twice that just in case. 
- */ - pt_size = 26 * guest_page_size; - - /* memslot sizes and gpa's must be aligned to the backing page size */ - pt_size = align_up(pt_size, backing_src_pagesz); - data_size = align_up(guest_page_size, backing_src_pagesz); - data_gpa = (max_gfn * guest_page_size) - data_size; - data_gpa = align_down(data_gpa, backing_src_pagesz); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, - CODE_AND_DATA_MEMSLOT, code_npages, 0); - vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT; - vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT; - - vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size, - PAGE_TABLE_MEMSLOT, pt_size / guest_page_size, - p->test_desc->pt_memslot_flags); - vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT; - - vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT, - data_size / guest_page_size, - p->test_desc->data_memslot_flags); - vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; -} - -static void setup_ucall(struct kvm_vm *vm) -{ - struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); - - ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size); -} - -static void setup_default_handlers(struct test_desc *test) -{ - if (!test->mmio_handler) - test->mmio_handler = mmio_no_handler; - - if (!test->fail_vcpu_run_handler) - test->fail_vcpu_run_handler = fail_vcpu_run_no_handler; -} - -static void check_event_counts(struct test_desc *test) -{ - TEST_ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); - TEST_ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits); - TEST_ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs); -} - -static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) -{ - struct test_desc *test = p->test_desc; - - pr_debug("Test: %s\n", test->name); - pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - pr_debug("Testing memory backing src type: %s\n", - vm_mem_backing_src_alias(p->src_type)->name); -} - -static void reset_event_counts(void) -{ - memset(&events, 0, sizeof(events)); -} - -/* - * This function either succeeds, skips the test (after setting test->skip), or - * fails with a TEST_FAIL that aborts all tests. - */ -static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - struct test_desc *test) -{ - struct kvm_run *run; - struct ucall uc; - int ret; - - run = vcpu->run; - - for (;;) { - ret = _vcpu_run(vcpu); - if (ret) { - test->fail_vcpu_run_handler(ret); - goto done; - } - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - if (!handle_cmd(vm, uc.args[1])) { - test->skip = true; - goto done; - } - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - goto done; - case UCALL_NONE: - if (run->exit_reason == KVM_EXIT_MMIO) - test->mmio_handler(vm, run); - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - pr_debug(test->skip ? 
"Skipped.\n" : "Done.\n"); -} - -static void run_test(enum vm_guest_mode mode, void *arg) -{ - struct test_params *p = (struct test_params *)arg; - struct test_desc *test = p->test_desc; - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - struct uffd_desc *pt_uffd, *data_uffd; - - print_test_banner(mode, p); - - vm = ____vm_create(VM_SHAPE(mode)); - setup_memslots(vm, p); - kvm_vm_elf_load(vm, program_invocation_name); - setup_ucall(vm); - vcpu = vm_vcpu_add(vm, 0, guest_code); - - setup_gva_maps(vm); - - reset_event_counts(); - - /* - * Set some code in the data memslot for the guest to execute (only - * applicable to the EXEC tests). This has to be done before - * setup_uffd() as that function copies the memslot data for the uffd - * handler. - */ - load_exec_code_for_test(vm); - setup_uffd(vm, p, &pt_uffd, &data_uffd); - setup_abort_handlers(vm, vcpu, test); - setup_default_handlers(test); - vcpu_args_set(vcpu, 1, test); - - vcpu_run_loop(vm, vcpu, test); - - kvm_vm_free(vm); - free_uffd(test, pt_uffd, data_uffd); - - /* - * Make sure we check the events after the uffd threads have exited, - * which means they updated their respective event counters. - */ - if (!test->skip) - check_event_counts(test); -} - -static void help(char *name) -{ - puts(""); - printf("usage: %s [-h] [-s mem-type]\n", name); - puts(""); - guest_modes_help(); - backing_src_help("-s"); - puts(""); -} - -#define SNAME(s) #s -#define SCAT2(a, b) SNAME(a ## _ ## b) -#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) -#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d)) - -#define _CHECK(_test) _CHECK_##_test -#define _PREPARE(_test) _PREPARE_##_test -#define _PREPARE_guest_read64 NULL -#define _PREPARE_guest_ld_preidx NULL -#define _PREPARE_guest_write64 NULL -#define _PREPARE_guest_st_preidx NULL -#define _PREPARE_guest_exec NULL -#define _PREPARE_guest_at NULL -#define _PREPARE_guest_dc_zva guest_check_dc_zva -#define _PREPARE_guest_cas guest_check_lse - -/* With or without access flag checks */ -#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af -#define _PREPARE_no_af NULL -#define _CHECK_with_af guest_check_pte_af -#define _CHECK_no_af NULL - -/* Performs an access and checks that no faults were triggered. 
*/ -#define TEST_ACCESS(_access, _with_af, _mark_cmd) \ -{ \ - .name = SCAT3(_access, _with_af, #_mark_cmd), \ - .guest_prepare = { _PREPARE(_with_af), \ - _PREPARE(_access) }, \ - .mem_mark_cmd = _mark_cmd, \ - .guest_test = _access, \ - .guest_test_check = { _CHECK(_with_af) }, \ - .expected_events = { 0 }, \ -} - -#define TEST_UFFD(_access, _with_af, _mark_cmd, \ - _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \ -{ \ - .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \ - .guest_prepare = { _PREPARE(_with_af), \ - _PREPARE(_access) }, \ - .guest_test = _access, \ - .mem_mark_cmd = _mark_cmd, \ - .guest_test_check = { _CHECK(_with_af) }, \ - .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = _uffd_pt_handler, \ - .expected_events = { .uffd_faults = _uffd_faults, }, \ -} - -#define TEST_DIRTY_LOG(_access, _with_af, _test_check, _pt_check) \ -{ \ - .name = SCAT3(dirty_log, _access, _with_af), \ - .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ - .guest_prepare = { _PREPARE(_with_af), \ - _PREPARE(_access) }, \ - .guest_test = _access, \ - .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ - .expected_events = { 0 }, \ -} - -#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ - _uffd_faults, _test_check, _pt_check) \ -{ \ - .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ - .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ - .guest_prepare = { _PREPARE(_with_af), \ - _PREPARE(_access) }, \ - .guest_test = _access, \ - .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ - .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ - .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_handler, \ - .expected_events = { .uffd_faults = _uffd_faults, }, \ -} - -#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ -{ \ - .name = SCAT2(ro_memslot, _access), \ - .data_memslot_flags = KVM_MEM_READONLY, \ - .pt_memslot_flags = KVM_MEM_READONLY, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .mmio_handler = _mmio_handler, \ - .expected_events = { .mmio_exits = _mmio_exits }, \ -} - -#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \ -{ \ - .name = SCAT2(ro_memslot_no_syndrome, _access), \ - .data_memslot_flags = KVM_MEM_READONLY, \ - .pt_memslot_flags = KVM_MEM_READONLY, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ - .expected_events = { .fail_vcpu_runs = 1 }, \ -} - -#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ - _test_check) \ -{ \ - .name = SCAT2(ro_memslot, _access), \ - .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .guest_test_check = { _test_check }, \ - .mmio_handler = _mmio_handler, \ - .expected_events = { .mmio_exits = _mmio_exits}, \ -} - -#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \ -{ \ - .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ - .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .guest_test_check = { _test_check }, \ - .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ - 
.expected_events = { .fail_vcpu_runs = 1 }, \ -} - -#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \ - _uffd_data_handler, _uffd_faults) \ -{ \ - .name = SCAT2(ro_memslot_uffd, _access), \ - .data_memslot_flags = KVM_MEM_READONLY, \ - .pt_memslot_flags = KVM_MEM_READONLY, \ - .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_handler, \ - .mmio_handler = _mmio_handler, \ - .expected_events = { .mmio_exits = _mmio_exits, \ - .uffd_faults = _uffd_faults }, \ -} - -#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \ - _uffd_faults) \ -{ \ - .name = SCAT2(ro_memslot_no_syndrome, _access), \ - .data_memslot_flags = KVM_MEM_READONLY, \ - .pt_memslot_flags = KVM_MEM_READONLY, \ - .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ - .guest_prepare = { _PREPARE(_access) }, \ - .guest_test = _access, \ - .uffd_data_handler = _uffd_data_handler, \ - .uffd_pt_handler = uffd_pt_handler, \ - .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ - .expected_events = { .fail_vcpu_runs = 1, \ - .uffd_faults = _uffd_faults }, \ -} - -static struct test_desc tests[] = { - - /* Check that HW is setting the Access Flag (AF) (sanity checks). */ - TEST_ACCESS(guest_read64, with_af, CMD_NONE), - TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE), - TEST_ACCESS(guest_cas, with_af, CMD_NONE), - TEST_ACCESS(guest_write64, with_af, CMD_NONE), - TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE), - TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE), - TEST_ACCESS(guest_exec, with_af, CMD_NONE), - - /* - * Punch a hole in the data backing store, and then try multiple - * accesses: reads should rturn zeroes, and writes should - * re-populate the page. Moreover, the test also check that no - * exception was generated in the guest. Note that this - * reading/writing behavior is the same as reading/writing a - * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from - * userspace. - */ - TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), - TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), - - /* - * Punch holes in the data and PT backing stores and mark them for - * userfaultfd handling. This should result in 2 faults: the access - * on the data backing store, and its respective S1 page table walk - * (S1PTW). - */ - TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - /* - * Can't test guest_at with_af as it's IMPDEF whether the AF is set. - * The S1PTW fault should still be marked as a write. 
- */ - TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_no_handler, uffd_pt_handler, 1), - TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, - uffd_data_handler, uffd_pt_handler, 2), - - /* - * Try accesses when the data and PT memory regions are both - * tracked for dirty logging. - */ - TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log, - guest_check_no_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_ld_preidx, with_af, - guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log, - guest_check_no_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - - /* - * Access when the data and PT memory regions are both marked for - * dirty logging and UFFD at the same time. The expected result is - * that writes should mark the dirty log and trigger a userfaultfd - * write fault. Reads/execs should result in a read userfaultfd - * fault, and nothing in the dirty log. Any S1PTW should result in - * a write in the dirty log and a userfaultfd write. 
- */ - TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, - uffd_data_handler, 2, - guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, - uffd_data_handler, 2, - guest_check_no_write_in_dirty_log, - guest_check_no_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, - uffd_data_handler, - 2, guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, uffd_no_handler, 1, - guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, - uffd_data_handler, 2, - guest_check_no_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, - uffd_data_handler, - 2, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, - uffd_data_handler, 2, - guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, - uffd_data_handler, - 2, guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, - uffd_data_handler, 2, - guest_check_write_in_dirty_log, - guest_check_s1ptw_wr_in_dirty_log), - /* - * Access when both the PT and data regions are marked read-only - * (with KVM_MEM_READONLY). Writes with a syndrome result in an - * MMIO exit, writes with no syndrome (e.g., CAS) result in a - * failed vcpu run, and reads/execs with and without syndroms do - * not fault. - */ - TEST_RO_MEMSLOT(guest_read64, 0, 0), - TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0), - TEST_RO_MEMSLOT(guest_at, 0, 0), - TEST_RO_MEMSLOT(guest_exec, 0, 0), - TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1), - TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva), - TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), - TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), - - /* - * The PT and data regions are both read-only and marked - * for dirty logging at the same time. The expected result is that - * for writes there should be no write in the dirty log. The - * readonly handling is the same as if the memslot was not marked - * for dirty logging: writes with a syndrome result in an MMIO - * exit, and writes with no syndrome result in a failed vcpu run. - */ - TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler, - 1, guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas, - guest_check_no_write_in_dirty_log), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx, - guest_check_no_write_in_dirty_log), - - /* - * The PT and data regions are both read-only and punched with - * holes tracked with userfaultfd. The expected result is the - * union of both userfaultfd and read-only behaviors. For example, - * write accesses result in a userfaultfd write fault and an MMIO - * exit. Writes with no syndrome result in a failed vcpu run and - * no userfaultfd write fault. Reads result in userfaultfd getting - * triggered. 
- */ - TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, uffd_data_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, uffd_data_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, uffd_no_handler, 1), - TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, uffd_data_handler, 2), - TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, - uffd_data_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, uffd_data_handler, 2), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, uffd_no_handler, 1), - TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, uffd_no_handler, 1), - - { 0 } -}; - -static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type) -{ - struct test_desc *t; - - for (t = &tests[0]; t->name; t++) { - if (t->skip) - continue; - - struct test_params p = { - .src_type = src_type, - .test_desc = t, - }; - - for_each_guest_mode(run_test, &p); - } -} - -int main(int argc, char *argv[]) -{ - enum vm_mem_backing_src_type src_type; - int opt; - - src_type = DEFAULT_VM_MEM_SRC; - - while ((opt = getopt(argc, argv, "hm:s:")) != -1) { - switch (opt) { - case 'm': - guest_modes_cmdline(optarg); - break; - case 's': - src_type = parse_backing_src_type(optarg); - break; - case 'h': - default: - help(argv[0]); - exit(0); - } - } - - for_each_test_and_guest_mode(src_type); - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c deleted file mode 100644 index ab491ee9e5f7..000000000000 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * psci_test - Tests relating to KVM's PSCI implementation. - * - * Copyright (c) 2021 Google LLC. - * - * This test includes: - * - A regression test for a race between KVM servicing the PSCI CPU_ON call - * and userspace reading the targeted vCPU's registers. - * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated - * KVM_SYSTEM_EVENT_SUSPEND UAPI. 
- */ - -#include -#include -#include - -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" - -#define CPU_ON_ENTRY_ADDR 0xfeedf00dul -#define CPU_ON_CONTEXT_ID 0xdeadc0deul - -static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, - uint64_t context_id) -{ - struct arm_smccc_res res; - - smccc_hvc(PSCI_0_2_FN64_CPU_ON, target_cpu, entry_addr, context_id, - 0, 0, 0, 0, &res); - - return res.a0; -} - -static uint64_t psci_affinity_info(uint64_t target_affinity, - uint64_t lowest_affinity_level) -{ - struct arm_smccc_res res; - - smccc_hvc(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity, lowest_affinity_level, - 0, 0, 0, 0, 0, &res); - - return res.a0; -} - -static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id) -{ - struct arm_smccc_res res; - - smccc_hvc(PSCI_1_0_FN64_SYSTEM_SUSPEND, entry_addr, context_id, - 0, 0, 0, 0, 0, &res); - - return res.a0; -} - -static uint64_t psci_system_off2(uint64_t type, uint64_t cookie) -{ - struct arm_smccc_res res; - - smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2, type, cookie, 0, 0, 0, 0, 0, &res); - - return res.a0; -} - -static uint64_t psci_features(uint32_t func_id) -{ - struct arm_smccc_res res; - - smccc_hvc(PSCI_1_0_FN_PSCI_FEATURES, func_id, 0, 0, 0, 0, 0, 0, &res); - - return res.a0; -} - -static void vcpu_power_off(struct kvm_vcpu *vcpu) -{ - struct kvm_mp_state mp_state = { - .mp_state = KVM_MP_STATE_STOPPED, - }; - - vcpu_mp_state_set(vcpu, &mp_state); -} - -static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, - struct kvm_vcpu **target) -{ - struct kvm_vcpu_init init; - struct kvm_vm *vm; - - vm = vm_create(2); - - vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); - init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); - - *source = aarch64_vcpu_add(vm, 0, &init, guest_code); - *target = aarch64_vcpu_add(vm, 1, &init, guest_code); - - return vm; -} - -static void enter_guest(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vcpu_run(vcpu); - if (get_ucall(vcpu, &uc) == UCALL_ABORT) - REPORT_GUEST_ASSERT(uc); -} - -static void assert_vcpu_reset(struct kvm_vcpu *vcpu) -{ - uint64_t obs_pc, obs_x0; - - obs_pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); - obs_x0 = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0])); - - TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, - "unexpected target cpu pc: %lx (expected: %lx)", - obs_pc, CPU_ON_ENTRY_ADDR); - TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, - "unexpected target context id: %lx (expected: %lx)", - obs_x0, CPU_ON_CONTEXT_ID); -} - -static void guest_test_cpu_on(uint64_t target_cpu) -{ - uint64_t target_state; - - GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); - - do { - target_state = psci_affinity_info(target_cpu, 0); - - GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || - (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); - } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); - - GUEST_DONE(); -} - -static void host_test_cpu_on(void) -{ - struct kvm_vcpu *source, *target; - uint64_t target_mpidr; - struct kvm_vm *vm; - struct ucall uc; - - vm = setup_vm(guest_test_cpu_on, &source, &target); - - /* - * make sure the target is already off when executing the test. 
- */ - vcpu_power_off(target); - - target_mpidr = vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); - vcpu_args_set(source, 1, target_mpidr & MPIDR_HWID_BITMASK); - enter_guest(source); - - if (get_ucall(source, &uc) != UCALL_DONE) - TEST_FAIL("Unhandled ucall: %lu", uc.cmd); - - assert_vcpu_reset(target); - kvm_vm_free(vm); -} - -static void guest_test_system_suspend(void) -{ - uint64_t ret; - - /* assert that SYSTEM_SUSPEND is discoverable */ - GUEST_ASSERT(!psci_features(PSCI_1_0_FN_SYSTEM_SUSPEND)); - GUEST_ASSERT(!psci_features(PSCI_1_0_FN64_SYSTEM_SUSPEND)); - - ret = psci_system_suspend(CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID); - GUEST_SYNC(ret); -} - -static void host_test_system_suspend(void) -{ - struct kvm_vcpu *source, *target; - struct kvm_run *run; - struct kvm_vm *vm; - - vm = setup_vm(guest_test_system_suspend, &source, &target); - vm_enable_cap(vm, KVM_CAP_ARM_SYSTEM_SUSPEND, 0); - - vcpu_power_off(target); - run = source->run; - - enter_guest(source); - - TEST_ASSERT_KVM_EXIT_REASON(source, KVM_EXIT_SYSTEM_EVENT); - TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND, - "Unhandled system event: %u (expected: %u)", - run->system_event.type, KVM_SYSTEM_EVENT_SUSPEND); - - kvm_vm_free(vm); -} - -static void guest_test_system_off2(void) -{ - uint64_t ret; - - /* assert that SYSTEM_OFF2 is discoverable */ - GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) & - PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); - GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) & - PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); - - /* With non-zero 'cookie' field, it should fail */ - ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 1); - GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); - - /* - * This would normally never return, so KVM sets the return value - * to PSCI_RET_INTERNAL_FAILURE. The test case *does* return, so - * that it can test both values for HIBERNATE_OFF. - */ - ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 0); - GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); - - /* - * Revision F.b of the PSCI v1.3 specification documents zero as an - * alias for HIBERNATE_OFF, since that's the value used in earlier - * revisions of the spec and some implementations in the field. - */ - ret = psci_system_off2(0, 1); - GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); - - ret = psci_system_off2(0, 0); - GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); - - GUEST_DONE(); -} - -static void host_test_system_off2(void) -{ - struct kvm_vcpu *source, *target; - struct kvm_mp_state mps; - uint64_t psci_version = 0; - int nr_shutdowns = 0; - struct kvm_run *run; - struct ucall uc; - - setup_vm(guest_test_system_off2, &source, &target); - - psci_version = vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION); - - TEST_ASSERT(psci_version >= PSCI_VERSION(1, 3), - "Unexpected PSCI version %lu.%lu", - PSCI_VERSION_MAJOR(psci_version), - PSCI_VERSION_MINOR(psci_version)); - - vcpu_power_off(target); - run = source->run; - - enter_guest(source); - while (run->exit_reason == KVM_EXIT_SYSTEM_EVENT) { - TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SHUTDOWN, - "Unhandled system event: %u (expected: %u)", - run->system_event.type, KVM_SYSTEM_EVENT_SHUTDOWN); - TEST_ASSERT(run->system_event.ndata >= 1, - "Unexpected amount of system event data: %u (expected, >= 1)", - run->system_event.ndata); - TEST_ASSERT(run->system_event.data[0] & KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2, - "PSCI_OFF2 flag not set. 
Flags %llu (expected %llu)", - run->system_event.data[0], KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); - - nr_shutdowns++; - - /* Restart the vCPU */ - mps.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_mp_state_set(source, &mps); - - enter_guest(source); - } - - TEST_ASSERT(get_ucall(source, &uc) == UCALL_DONE, "Guest did not exit cleanly"); - TEST_ASSERT(nr_shutdowns == 2, "Two shutdown events were expected, but saw %d", nr_shutdowns); -} - -int main(void) -{ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); - - host_test_cpu_on(); - host_test_system_suspend(); - host_test_system_off2(); - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c deleted file mode 100644 index bc6cf50e5135..000000000000 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ /dev/null @@ -1,695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * set_id_regs - Test for setting ID register from usersapce. - * - * Copyright (c) 2023 Google LLC. - * - * - * Test that KVM supports setting ID registers from userspace and handles the - * feature set correctly. - */ - -#include -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" -#include - -enum ftr_type { - FTR_EXACT, /* Use a predefined safe value */ - FTR_LOWER_SAFE, /* Smaller value is safe */ - FTR_HIGHER_SAFE, /* Bigger value is safe */ - FTR_HIGHER_OR_ZERO_SAFE, /* Bigger value is safe, but 0 is biggest */ - FTR_END, /* Mark the last ftr bits */ -}; - -#define FTR_SIGNED true /* Value should be treated as signed */ -#define FTR_UNSIGNED false /* Value should be treated as unsigned */ - -struct reg_ftr_bits { - char *name; - bool sign; - enum ftr_type type; - uint8_t shift; - uint64_t mask; - /* - * For FTR_EXACT, safe_val is used as the exact safe value. - * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. 
- */ - int64_t safe_val; -}; - -struct test_feature_reg { - uint32_t reg; - const struct reg_ftr_bits *ftr_bits; -}; - -#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL) \ - { \ - .name = #NAME, \ - .sign = SIGNED, \ - .type = TYPE, \ - .shift = SHIFT, \ - .mask = MASK, \ - .safe_val = SAFE_VAL, \ - } - -#define REG_FTR_BITS(type, reg, field, safe_val) \ - __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \ - reg##_##field##_MASK, safe_val) - -#define S_REG_FTR_BITS(type, reg, field, safe_val) \ - __REG_FTR_BITS(reg##_##field, FTR_SIGNED, type, reg##_##field##_SHIFT, \ - reg##_##field##_MASK, safe_val) - -#define REG_FTR_END \ - { \ - .type = FTR_END, \ - } - -static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DoubleLock, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, WRPs, 0), - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_dfr0_el1[] = { - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64isar0_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RNDR, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TLB, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TS, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, FHM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, DP, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM4, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM3, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA3, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RDM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TME, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, ATOMIC, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, CRC32, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA2, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA1, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, AES, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64isar1_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LS64, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, XS, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, I8MM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DGH, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, BF16, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SPECRES, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SB, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FRINTTS, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LRCPC, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FCMA, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, JSCVT, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DPB, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, BC, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, RPRES, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, WFxT, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV3, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), - 
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), - REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL0, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, CSV2_frac, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, SSBS, ID_AA64PFR1_EL1_SSBS_NI), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, BT, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN4, 0), - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN64, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN16, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGENDEL0, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, SNSMEM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGEND, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ASIDBITS, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, PARANGE, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64mmfr1_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, TIDCP1, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, AFP, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, ETS, 0), - REG_FTR_BITS(FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1, SpecSEI, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, PAN, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, LO, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HPDS, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HAFDBS, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64mmfr2_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, E0PD, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, BBM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, TTL, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, AT, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, ST, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, VARange, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, IESB, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, LSM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, UAO, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, CnP, 0), - REG_FTR_END, -}; - -static const struct reg_ftr_bits ftr_id_aa64zfr0_el1[] = { - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F64MM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F32MM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, I8MM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SM4, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SHA3, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BF16, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BitPerm, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, AES, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SVEver, 0), - REG_FTR_END, -}; - -#define TEST_REG(id, table) \ - { \ - .reg = id, \ - .ftr_bits = &((table)[0]), \ - } - -static struct test_feature_reg test_regs[] = { - TEST_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0_el1), - TEST_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0_el1), - TEST_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0_el1), - TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), - TEST_REG(SYS_ID_AA64ISAR2_EL1, 
ftr_id_aa64isar2_el1), - TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), - TEST_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1_el1), - TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), - TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), - TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), - TEST_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0_el1), -}; - -#define GUEST_REG_SYNC(id) GUEST_SYNC_ARGS(0, id, read_sysreg_s(id), 0, 0); - -static void guest_code(void) -{ - GUEST_REG_SYNC(SYS_ID_AA64DFR0_EL1); - GUEST_REG_SYNC(SYS_ID_DFR0_EL1); - GUEST_REG_SYNC(SYS_ID_AA64ISAR0_EL1); - GUEST_REG_SYNC(SYS_ID_AA64ISAR1_EL1); - GUEST_REG_SYNC(SYS_ID_AA64ISAR2_EL1); - GUEST_REG_SYNC(SYS_ID_AA64PFR0_EL1); - GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); - GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); - GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); - GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); - GUEST_REG_SYNC(SYS_CTR_EL0); - - GUEST_DONE(); -} - -/* Return a safe value to a given ftr_bits an ftr value */ -uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) -{ - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - - if (ftr_bits->sign == FTR_UNSIGNED) { - switch (ftr_bits->type) { - case FTR_EXACT: - ftr = ftr_bits->safe_val; - break; - case FTR_LOWER_SAFE: - if (ftr > ftr_bits->safe_val) - ftr--; - break; - case FTR_HIGHER_SAFE: - if (ftr < ftr_max) - ftr++; - break; - case FTR_HIGHER_OR_ZERO_SAFE: - if (ftr == ftr_max) - ftr = 0; - else if (ftr != 0) - ftr++; - break; - default: - break; - } - } else if (ftr != ftr_max) { - switch (ftr_bits->type) { - case FTR_EXACT: - ftr = ftr_bits->safe_val; - break; - case FTR_LOWER_SAFE: - if (ftr > ftr_bits->safe_val) - ftr--; - break; - case FTR_HIGHER_SAFE: - if (ftr < ftr_max - 1) - ftr++; - break; - case FTR_HIGHER_OR_ZERO_SAFE: - if (ftr != 0 && ftr != ftr_max - 1) - ftr++; - break; - default: - break; - } - } - - return ftr; -} - -/* Return an invalid value to a given ftr_bits an ftr value */ -uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) -{ - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - - if (ftr_bits->sign == FTR_UNSIGNED) { - switch (ftr_bits->type) { - case FTR_EXACT: - ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); - break; - case FTR_LOWER_SAFE: - ftr++; - break; - case FTR_HIGHER_SAFE: - ftr--; - break; - case FTR_HIGHER_OR_ZERO_SAFE: - if (ftr == 0) - ftr = ftr_max; - else - ftr--; - break; - default: - break; - } - } else if (ftr != ftr_max) { - switch (ftr_bits->type) { - case FTR_EXACT: - ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); - break; - case FTR_LOWER_SAFE: - ftr++; - break; - case FTR_HIGHER_SAFE: - ftr--; - break; - case FTR_HIGHER_OR_ZERO_SAFE: - if (ftr == 0) - ftr = ftr_max - 1; - else - ftr--; - break; - default: - break; - } - } else { - ftr = 0; - } - - return ftr; -} - -static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) -{ - uint8_t shift = ftr_bits->shift; - uint64_t mask = ftr_bits->mask; - uint64_t val, new_val, ftr; - - val = vcpu_get_reg(vcpu, reg); - ftr = (val & mask) >> shift; - - ftr = get_safe_value(ftr_bits, ftr); - - ftr <<= shift; - val &= ~mask; - val |= ftr; - - vcpu_set_reg(vcpu, reg, val); - new_val = vcpu_get_reg(vcpu, reg); - TEST_ASSERT_EQ(new_val, val); - - return new_val; -} - -static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) -{ - uint8_t shift = ftr_bits->shift; - uint64_t mask = ftr_bits->mask; - uint64_t val, 
old_val, ftr; - int r; - - val = vcpu_get_reg(vcpu, reg); - ftr = (val & mask) >> shift; - - ftr = get_invalid_value(ftr_bits, ftr); - - old_val = val; - ftr <<= shift; - val &= ~mask; - val |= ftr; - - r = __vcpu_set_reg(vcpu, reg, val); - TEST_ASSERT(r < 0 && errno == EINVAL, - "Unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); - - val = vcpu_get_reg(vcpu, reg); - TEST_ASSERT_EQ(val, old_val); -} - -static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; - -#define encoding_to_range_idx(encoding) \ - KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ - sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ - sys_reg_Op2(encoding)) - - -static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) -{ - uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; - struct reg_mask_range range = { - .addr = (__u64)masks, - }; - int ret; - - /* KVM should return error when reserved field is not zero */ - range.reserved[0] = 1; - ret = __vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); - TEST_ASSERT(ret, "KVM doesn't check invalid parameters."); - - /* Get writable masks for feature ID registers */ - memset(range.reserved, 0, sizeof(range.reserved)); - vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); - - for (int i = 0; i < ARRAY_SIZE(test_regs); i++) { - const struct reg_ftr_bits *ftr_bits = test_regs[i].ftr_bits; - uint32_t reg_id = test_regs[i].reg; - uint64_t reg = KVM_ARM64_SYS_REG(reg_id); - int idx; - - /* Get the index to masks array for the idreg */ - idx = encoding_to_range_idx(reg_id); - - for (int j = 0; ftr_bits[j].type != FTR_END; j++) { - /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ - if (aarch64_only && sys_reg_CRm(reg_id) < 4) { - ksft_test_result_skip("%s on AARCH64 only system\n", - ftr_bits[j].name); - continue; - } - - /* Make sure the feature field is writable */ - TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); - - test_reg_set_fail(vcpu, reg, &ftr_bits[j]); - - test_reg_vals[idx] = test_reg_set_success(vcpu, reg, - &ftr_bits[j]); - - ksft_test_result_pass("%s\n", ftr_bits[j].name); - } - } -} - -#define MPAM_IDREG_TEST 6 -static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu) -{ - uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; - struct reg_mask_range range = { - .addr = (__u64)masks, - }; - uint64_t val; - int idx, err; - - /* - * If ID_AA64PFR0.MPAM is _not_ officially modifiable and is zero, - * check that if it can be set to 1, (i.e. it is supported by the - * hardware), that it can't be set to other values. - */ - - /* Get writable masks for feature ID registers */ - memset(range.reserved, 0, sizeof(range.reserved)); - vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); - - /* Writeable? Nothing to test! */ - idx = encoding_to_range_idx(SYS_ID_AA64PFR0_EL1); - if ((masks[idx] & ID_AA64PFR0_EL1_MPAM_MASK) == ID_AA64PFR0_EL1_MPAM_MASK) { - ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is officially writable, nothing to test\n"); - return; - } - - /* Get the id register value */ - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - - /* Try to set MPAM=0. This should always be possible. 
*/ - val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 0); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); - if (err) - ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM=0 was not accepted\n"); - else - ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=0 worked\n"); - - /* Try to set MPAM=1 */ - val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 1); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); - if (err) - ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is not writable, nothing to test\n"); - else - ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=1 was writable\n"); - - /* Try to set MPAM=2 */ - val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 2); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); - if (err) - ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM not arbitrarily modifiable\n"); - else - ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM value should not be ignored\n"); - - /* And again for ID_AA64PFR1_EL1.MPAM_frac */ - idx = encoding_to_range_idx(SYS_ID_AA64PFR1_EL1); - if ((masks[idx] & ID_AA64PFR1_EL1_MPAM_frac_MASK) == ID_AA64PFR1_EL1_MPAM_frac_MASK) { - ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is officially writable, nothing to test\n"); - return; - } - - /* Get the id register value */ - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); - - /* Try to set MPAM_frac=0. This should always be possible. */ - val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; - val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 0); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); - if (err) - ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM_frac=0 was not accepted\n"); - else - ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=0 worked\n"); - - /* Try to set MPAM_frac=1 */ - val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; - val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 1); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); - if (err) - ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is not writable, nothing to test\n"); - else - ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=1 was writable\n"); - - /* Try to set MPAM_frac=2 */ - val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; - val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 2); - err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); - if (err) - ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac not arbitrarily modifiable\n"); - else - ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac value should not be ignored\n"); -} - -static void test_guest_reg_read(struct kvm_vcpu *vcpu) -{ - bool done = false; - struct ucall uc; - - while (!done) { - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_SYNC: - /* Make sure the written values are seen by guest */ - TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], - uc.args[3]); - break; - case UCALL_DONE: - done = true; - break; - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } - } -} - -/* Politely lifted from arch/arm64/include/asm/cache.h */ -/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ -#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) -#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) -#define CLIDR_CTYPE(clidr, level) \ - (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) - -static void test_clidr(struct kvm_vcpu *vcpu) -{ - uint64_t clidr; - int level; - 
- clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); - - /* find the first empty level in the cache hierarchy */ - for (level = 1; level < 7; level++) { - if (!CLIDR_CTYPE(clidr, level)) - break; - } - - /* - * If you have a mind-boggling 7 levels of cache, congratulations, you - * get to fix this. - */ - TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); - - /* stick in a unified cache level */ - clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); - - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); - test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; -} - -static void test_ctr(struct kvm_vcpu *vcpu) -{ - u64 ctr; - - ctr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0)); - ctr &= ~CTR_EL0_DIC_MASK; - if (ctr & CTR_EL0_IminLine_MASK) - ctr--; - - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), ctr); - test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr; -} - -static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) -{ - u64 val; - - test_clidr(vcpu); - test_ctr(vcpu); - - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); - val++; - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); - - test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; - ksft_test_result_pass("%s\n", __func__); -} - -static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) -{ - size_t idx = encoding_to_range_idx(encoding); - uint64_t observed; - - observed = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding)); - TEST_ASSERT_EQ(test_reg_vals[idx], observed); -} - -static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) -{ - /* - * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an - * architectural reset of the vCPU. - */ - aarch64_vcpu_setup(vcpu, NULL); - - for (int i = 0; i < ARRAY_SIZE(test_regs); i++) - test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); - - test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1); - test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); - test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0); - - ksft_test_result_pass("%s\n", __func__); -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - bool aarch64_only; - uint64_t val, el0; - int test_cnt; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - /* Check for AARCH64 only system */ - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); - aarch64_only = (el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY); - - ksft_print_header(); - - test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + - ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + - ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + - MPAM_IDREG_TEST; - - ksft_set_plan(test_cnt); - - test_vm_ftr_id_regs(vcpu, aarch64_only); - test_vcpu_ftr_id_regs(vcpu); - test_user_set_mpam_reg(vcpu); - - test_guest_reg_read(vcpu); - - test_reset_preserves_id_regs(vcpu); - - kvm_vm_free(vm); - - ksft_finished(); -} diff --git a/tools/testing/selftests/kvm/aarch64/smccc_filter.c b/tools/testing/selftests/kvm/aarch64/smccc_filter.c deleted file mode 100644 index 2d189f3da228..000000000000 --- a/tools/testing/selftests/kvm/aarch64/smccc_filter.c +++ /dev/null @@ -1,268 +0,0 
@@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * smccc_filter - Tests for the SMCCC filter UAPI. - * - * Copyright (c) 2023 Google LLC - * - * This test includes: - * - Tests that the UAPI constraints are upheld by KVM. For example, userspace - * is prevented from filtering the architecture range of SMCCC calls. - * - Test that the filter actions (DENIED, FWD_TO_USER) work as intended. - */ - -#include -#include -#include - -#include "processor.h" -#include "test_util.h" - -enum smccc_conduit { - HVC_INSN, - SMC_INSN, -}; - -#define for_each_conduit(conduit) \ - for (conduit = HVC_INSN; conduit <= SMC_INSN; conduit++) - -static void guest_main(uint32_t func_id, enum smccc_conduit conduit) -{ - struct arm_smccc_res res; - - if (conduit == SMC_INSN) - smccc_smc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); - else - smccc_hvc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); - - GUEST_SYNC(res.a0); -} - -static int __set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, - enum kvm_smccc_filter_action action) -{ - struct kvm_smccc_filter filter = { - .base = start, - .nr_functions = nr_functions, - .action = action, - }; - - return __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, - KVM_ARM_VM_SMCCC_FILTER, &filter); -} - -static void set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, - enum kvm_smccc_filter_action action) -{ - int ret = __set_smccc_filter(vm, start, nr_functions, action); - - TEST_ASSERT(!ret, "failed to configure SMCCC filter: %d", ret); -} - -static struct kvm_vm *setup_vm(struct kvm_vcpu **vcpu) -{ - struct kvm_vcpu_init init; - struct kvm_vm *vm; - - vm = vm_create(1); - vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); - - /* - * Enable in-kernel emulation of PSCI to ensure that calls are denied - * due to the SMCCC filter, not because of KVM. 
- */ - init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); - - *vcpu = aarch64_vcpu_add(vm, 0, &init, guest_main); - return vm; -} - -static void test_pad_must_be_zero(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - struct kvm_smccc_filter filter = { - .base = PSCI_0_2_FN_PSCI_VERSION, - .nr_functions = 1, - .action = KVM_SMCCC_FILTER_DENY, - .pad = { -1 }, - }; - int r; - - r = __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, - KVM_ARM_VM_SMCCC_FILTER, &filter); - TEST_ASSERT(r < 0 && errno == EINVAL, - "Setting filter with nonzero padding should return EINVAL"); -} - -/* Ensure that userspace cannot filter the Arm Architecture SMCCC range */ -static void test_filter_reserved_range(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - uint32_t smc64_fn; - int r; - - r = __set_smccc_filter(vm, ARM_SMCCC_ARCH_WORKAROUND_1, - 1, KVM_SMCCC_FILTER_DENY); - TEST_ASSERT(r < 0 && errno == EEXIST, - "Attempt to filter reserved range should return EEXIST"); - - smc64_fn = ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, - 0, 0); - - r = __set_smccc_filter(vm, smc64_fn, 1, KVM_SMCCC_FILTER_DENY); - TEST_ASSERT(r < 0 && errno == EEXIST, - "Attempt to filter reserved range should return EEXIST"); - - kvm_vm_free(vm); -} - -static void test_invalid_nr_functions(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - int r; - - r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 0, KVM_SMCCC_FILTER_DENY); - TEST_ASSERT(r < 0 && errno == EINVAL, - "Attempt to filter 0 functions should return EINVAL"); - - kvm_vm_free(vm); -} - -static void test_overflow_nr_functions(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - int r; - - r = __set_smccc_filter(vm, ~0, ~0, KVM_SMCCC_FILTER_DENY); - TEST_ASSERT(r < 0 && errno == EINVAL, - "Attempt to overflow filter range should return EINVAL"); - - kvm_vm_free(vm); -} - -static void test_reserved_action(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - int r; - - r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, -1); - TEST_ASSERT(r < 0 && errno == EINVAL, - "Attempt to use reserved filter action should return EINVAL"); - - kvm_vm_free(vm); -} - - -/* Test that overlapping configurations of the SMCCC filter are rejected */ -static void test_filter_overlap(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm = setup_vm(&vcpu); - int r; - - set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); - - r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); - TEST_ASSERT(r < 0 && errno == EEXIST, - "Attempt to filter already configured range should return EEXIST"); - - kvm_vm_free(vm); -} - -static void expect_call_denied(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) != UCALL_SYNC) - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - - TEST_ASSERT(uc.args[1] == SMCCC_RET_NOT_SUPPORTED, - "Unexpected SMCCC return code: %lu", uc.args[1]); -} - -/* Denied SMCCC calls have a return code of SMCCC_RET_NOT_SUPPORTED */ -static void test_filter_denied(void) -{ - enum smccc_conduit conduit; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - for_each_conduit(conduit) { - vm = setup_vm(&vcpu); - - set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_DENY); - vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); - - vcpu_run(vcpu); - expect_call_denied(vcpu); - - kvm_vm_free(vm); - } -} - -static void expect_call_fwd_to_user(struct kvm_vcpu *vcpu, uint32_t func_id, - enum 
smccc_conduit conduit) -{ - struct kvm_run *run = vcpu->run; - - TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERCALL, - "Unexpected exit reason: %u", run->exit_reason); - TEST_ASSERT(run->hypercall.nr == func_id, - "Unexpected SMCCC function: %llu", run->hypercall.nr); - - if (conduit == SMC_INSN) - TEST_ASSERT(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC, - "KVM_HYPERCALL_EXIT_SMC is not set"); - else - TEST_ASSERT(!(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC), - "KVM_HYPERCALL_EXIT_SMC is set"); -} - -/* SMCCC calls forwarded to userspace cause KVM_EXIT_HYPERCALL exits */ -static void test_filter_fwd_to_user(void) -{ - enum smccc_conduit conduit; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - for_each_conduit(conduit) { - vm = setup_vm(&vcpu); - - set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_FWD_TO_USER); - vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); - - vcpu_run(vcpu); - expect_call_fwd_to_user(vcpu, PSCI_0_2_FN_PSCI_VERSION, conduit); - - kvm_vm_free(vm); - } -} - -static bool kvm_supports_smccc_filter(void) -{ - struct kvm_vm *vm = vm_create_barebones(); - int r; - - r = __kvm_has_device_attr(vm->fd, KVM_ARM_VM_SMCCC_CTRL, KVM_ARM_VM_SMCCC_FILTER); - - kvm_vm_free(vm); - return !r; -} - -int main(void) -{ - TEST_REQUIRE(kvm_supports_smccc_filter()); - - test_pad_must_be_zero(); - test_invalid_nr_functions(); - test_overflow_nr_functions(); - test_reserved_action(); - test_filter_reserved_range(); - test_filter_overlap(); - test_filter_denied(); - test_filter_fwd_to_user(); -} diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c deleted file mode 100644 index 80b74c6f152b..000000000000 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vcpu_width_config - Test KVM_ARM_VCPU_INIT() with KVM_ARM_VCPU_EL1_32BIT. - * - * Copyright (c) 2022 Google LLC. - * - * This is a test that ensures that non-mixed-width vCPUs (all 64bit vCPUs - * or all 32bit vcPUs) can be configured and mixed-width vCPUs cannot be - * configured. - */ - -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" - - -/* - * Add a vCPU, run KVM_ARM_VCPU_INIT with @init0, and then - * add another vCPU, and run KVM_ARM_VCPU_INIT with @init1. - */ -static int add_init_2vcpus(struct kvm_vcpu_init *init0, - struct kvm_vcpu_init *init1) -{ - struct kvm_vcpu *vcpu0, *vcpu1; - struct kvm_vm *vm; - int ret; - - vm = vm_create_barebones(); - - vcpu0 = __vm_vcpu_add(vm, 0); - ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); - if (ret) - goto free_exit; - - vcpu1 = __vm_vcpu_add(vm, 1); - ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); - -free_exit: - kvm_vm_free(vm); - return ret; -} - -/* - * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init0, - * and run KVM_ARM_VCPU_INIT for another vCPU with @init1. 
- */ -static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init0, - struct kvm_vcpu_init *init1) -{ - struct kvm_vcpu *vcpu0, *vcpu1; - struct kvm_vm *vm; - int ret; - - vm = vm_create_barebones(); - - vcpu0 = __vm_vcpu_add(vm, 0); - vcpu1 = __vm_vcpu_add(vm, 1); - - ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); - if (ret) - goto free_exit; - - ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); - -free_exit: - kvm_vm_free(vm); - return ret; -} - -/* - * Tests that two 64bit vCPUs can be configured, two 32bit vCPUs can be - * configured, and two mixed-width vCPUs cannot be configured. - * Each of those three cases, configure vCPUs in two different orders. - * The one is running KVM_CREATE_VCPU for 2 vCPUs, and then running - * KVM_ARM_VCPU_INIT for them. - * The other is running KVM_CREATE_VCPU and KVM_ARM_VCPU_INIT for a vCPU, - * and then run those commands for another vCPU. - */ -int main(void) -{ - struct kvm_vcpu_init init0, init1; - struct kvm_vm *vm; - int ret; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)); - - /* Get the preferred target type and copy that to init1 for later use */ - vm = vm_create_barebones(); - vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init0); - kvm_vm_free(vm); - init1 = init0; - - /* Test with 64bit vCPUs */ - ret = add_init_2vcpus(&init0, &init0); - TEST_ASSERT(ret == 0, - "Configuring 64bit EL1 vCPUs failed unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init0, &init0); - TEST_ASSERT(ret == 0, - "Configuring 64bit EL1 vCPUs failed unexpectedly"); - - /* Test with 32bit vCPUs */ - init0.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); - ret = add_init_2vcpus(&init0, &init0); - TEST_ASSERT(ret == 0, - "Configuring 32bit EL1 vCPUs failed unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init0, &init0); - TEST_ASSERT(ret == 0, - "Configuring 32bit EL1 vCPUs failed unexpectedly"); - - /* Test with mixed-width vCPUs */ - init0.features[0] = 0; - init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); - ret = add_init_2vcpus(&init0, &init1); - TEST_ASSERT(ret != 0, - "Configuring mixed-width vCPUs worked unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init0, &init1); - TEST_ASSERT(ret != 0, - "Configuring mixed-width vCPUs worked unexpectedly"); - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c deleted file mode 100644 index b3b5fb0ff0a9..000000000000 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ /dev/null @@ -1,764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * vgic init sequence tests - * - * Copyright (C) 2020, Red Hat, Inc. - */ -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vgic.h" - -#define NR_VCPUS 4 - -#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset) - -#define GICR_TYPER 0x8 - -#define VGIC_DEV_IS_V2(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V2) -#define VGIC_DEV_IS_V3(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V3) - -struct vm_gic { - struct kvm_vm *vm; - int gic_fd; - uint32_t gic_dev_type; -}; - -static uint64_t max_phys_size; - -/* - * Helpers to access a redistributor register and verify the ioctl() failed or - * succeeded as expected, and provided the correct value on success. 
- */ -static void v3_redist_reg_get_errno(int gicv3_fd, int vcpu, int offset, - int want, const char *msg) -{ - uint32_t ignored_val; - int ret = __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, - REG_OFFSET(vcpu, offset), &ignored_val); - - TEST_ASSERT(ret && errno == want, "%s; want errno = %d", msg, want); -} - -static void v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t want, - const char *msg) -{ - uint32_t val; - - kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, - REG_OFFSET(vcpu, offset), &val); - TEST_ASSERT(val == want, "%s; want '0x%x', got '0x%x'", msg, want, val); -} - -/* dummy guest code */ -static void guest_code(void) -{ - GUEST_SYNC(0); - GUEST_SYNC(1); - GUEST_SYNC(2); - GUEST_DONE(); -} - -/* we don't want to assert on run execution, hence that helper */ -static int run_vcpu(struct kvm_vcpu *vcpu) -{ - return __vcpu_run(vcpu) ? -errno : 0; -} - -static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, - uint32_t nr_vcpus, - struct kvm_vcpu *vcpus[]) -{ - struct vm_gic v; - - v.gic_dev_type = gic_dev_type; - v.vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - v.gic_fd = kvm_create_device(v.vm, gic_dev_type); - - return v; -} - -static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type) -{ - struct vm_gic v; - - v.gic_dev_type = gic_dev_type; - v.vm = vm_create_barebones(); - v.gic_fd = kvm_create_device(v.vm, gic_dev_type); - - return v; -} - - -static void vm_gic_destroy(struct vm_gic *v) -{ - close(v->gic_fd); - kvm_vm_free(v->vm); -} - -struct vgic_region_attr { - uint64_t attr; - uint64_t size; - uint64_t alignment; -}; - -struct vgic_region_attr gic_v3_dist_region = { - .attr = KVM_VGIC_V3_ADDR_TYPE_DIST, - .size = 0x10000, - .alignment = 0x10000, -}; - -struct vgic_region_attr gic_v3_redist_region = { - .attr = KVM_VGIC_V3_ADDR_TYPE_REDIST, - .size = NR_VCPUS * 0x20000, - .alignment = 0x10000, -}; - -struct vgic_region_attr gic_v2_dist_region = { - .attr = KVM_VGIC_V2_ADDR_TYPE_DIST, - .size = 0x1000, - .alignment = 0x1000, -}; - -struct vgic_region_attr gic_v2_cpu_region = { - .attr = KVM_VGIC_V2_ADDR_TYPE_CPU, - .size = 0x2000, - .alignment = 0x1000, -}; - -/** - * Helper routine that performs KVM device tests in general. Eventually the - * ARM_VGIC (GICv2 or GICv3) device gets created with an overlapping - * DIST/REDIST (or DIST/CPUIF for GICv2). Assumption is 4 vcpus are going to be - * used hence the overlap. In the case of GICv3, A RDIST region is set at @0x0 - * and a DIST region is set @0x70000. The GICv2 case sets a CPUIF @0x0 and a - * DIST region @0x1000. - */ -static void subtest_dist_rdist(struct vm_gic *v) -{ - int ret; - uint64_t addr; - struct vgic_region_attr rdist; /* CPU interface in GICv2*/ - struct vgic_region_attr dist; - - rdist = VGIC_DEV_IS_V3(v->gic_dev_type) ? gic_v3_redist_region - : gic_v2_cpu_region; - dist = VGIC_DEV_IS_V3(v->gic_dev_type) ? 
gic_v3_dist_region - : gic_v2_dist_region; - - /* Check existing group/attributes */ - kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, dist.attr); - - kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, rdist.attr); - - /* check non existing attribute */ - ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, -1); - TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); - - /* misaligned DIST and REDIST address settings */ - addr = dist.alignment / 0x10; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr); - TEST_ASSERT(ret && errno == EINVAL, "GIC dist base not aligned"); - - addr = rdist.alignment / 0x10; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr); - TEST_ASSERT(ret && errno == EINVAL, "GIC redist/cpu base not aligned"); - - /* out of range address */ - addr = max_phys_size; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr); - TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); - - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr); - TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); - - /* Space for half a rdist (a rdist is: 2 * rdist.alignment). */ - addr = max_phys_size - dist.alignment; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr); - TEST_ASSERT(ret && errno == E2BIG, - "half of the redist is beyond IPA limit"); - - /* set REDIST base address @0x0*/ - addr = 0x00000; - kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr); - - /* Attempt to create a second legacy redistributor region */ - addr = 0xE0000; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr); - TEST_ASSERT(ret && errno == EEXIST, "GIC redist base set again"); - - ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST); - if (!ret) { - /* Attempt to mix legacy and new redistributor regions */ - addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, - "attempt to mix GICv3 REDIST and REDIST_REGION"); - } - - /* - * Set overlapping DIST / REDIST, cannot be detected here. Will be detected - * on first vcpu run instead. 
- */ - addr = rdist.size - rdist.alignment; - kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr); -} - -/* Test the new REDIST region API */ -static void subtest_v3_redist_regions(struct vm_gic *v) -{ - uint64_t addr, expected_addr; - int ret; - - ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST); - TEST_ASSERT(!ret, "Multiple redist regions advertised"); - - addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); - - addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, - "attempt to register the first rdist region with index != 0"); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, - "register an rdist region overlapping with another one"); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1"); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); - kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - addr = REDIST_REGION_ATTR_ADDR(1, max_phys_size, 0, 2); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == E2BIG, - "register redist region with base address beyond IPA range"); - - /* The last redist is above the pa range. 
*/ - addr = REDIST_REGION_ATTR_ADDR(2, max_phys_size - 0x30000, 0, 2); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == E2BIG, - "register redist region with top address beyond IPA range"); - - addr = 0x260000; - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); - TEST_ASSERT(ret && errno == EINVAL, - "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); - - /* - * Now there are 2 redist regions: - * region 0 @ 0x200000 2 redists - * region 1 @ 0x240000 1 redist - * Attempt to read their characteristics - */ - - addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); - expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); - - addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); - expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); - ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); - - addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); - ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); - - addr = 0x260000; - kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); - ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); -} - -/* - * VGIC KVM device is created and initialized before the secondary CPUs - * get created - */ -static void test_vgic_then_vcpus(uint32_t gic_dev_type) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - struct vm_gic v; - int ret, i; - - v = vm_gic_create_with_vcpus(gic_dev_type, 1, vcpus); - - subtest_dist_rdist(&v); - - /* Add the rest of the VCPUs */ - for (i = 1; i < NR_VCPUS; ++i) - vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); - - ret = run_vcpu(vcpus[3]); - TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); - - vm_gic_destroy(&v); -} - -/* All the VCPUs are created before the VGIC KVM device gets initialized */ -static void test_vcpus_then_vgic(uint32_t gic_dev_type) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - struct vm_gic v; - int ret; - - v = vm_gic_create_with_vcpus(gic_dev_type, NR_VCPUS, vcpus); - - subtest_dist_rdist(&v); - - ret = run_vcpu(vcpus[3]); - TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); - - vm_gic_destroy(&v); -} - -#define KVM_VGIC_V2_ATTR(offset, cpu) \ - (FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \ - FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu)) - -#define GIC_CPU_CTRL 0x00 - -static void test_v2_uaccess_cpuif_no_vcpus(void) -{ - struct vm_gic v; - u64 val = 0; - int ret; - - v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2); - subtest_dist_rdist(&v); - - ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, - KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0)); - TEST_ASSERT(ret && errno == EINVAL, - "accessed non-existent CPU interface, want errno: %i", - EINVAL); - ret = __kvm_device_attr_get(v.gic_fd, 
KVM_DEV_ARM_VGIC_GRP_CPU_REGS, - KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); - TEST_ASSERT(ret && errno == EINVAL, - "accessed non-existent CPU interface, want errno: %i", - EINVAL); - ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, - KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); - TEST_ASSERT(ret && errno == EINVAL, - "accessed non-existent CPU interface, want errno: %i", - EINVAL); - - vm_gic_destroy(&v); -} - -static void test_v3_new_redist_regions(void) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - void *dummy = NULL; - struct vm_gic v; - uint64_t addr; - int ret; - - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); - subtest_v3_redist_regions(&v); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - ret = run_vcpu(vcpus[3]); - TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); - vm_gic_destroy(&v); - - /* step2 */ - - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); - subtest_v3_redist_regions(&v); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - ret = run_vcpu(vcpus[3]); - TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); - - vm_gic_destroy(&v); - - /* step 3 */ - - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); - subtest_v3_redist_regions(&v); - - ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy); - TEST_ASSERT(ret && errno == EFAULT, - "register a third region allowing to cover the 4 vcpus"); - - addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - ret = run_vcpu(vcpus[3]); - TEST_ASSERT(!ret, "vcpu run"); - - vm_gic_destroy(&v); -} - -static void test_v3_typer_accesses(void) -{ - struct vm_gic v; - uint64_t addr; - int ret, i; - - v.vm = vm_create(NR_VCPUS); - (void)vm_vcpu_add(v.vm, 0, guest_code); - - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); - - (void)vm_vcpu_add(v.vm, 3, guest_code); - - v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, - "attempting to read GICR_TYPER of non created vcpu"); - - (void)vm_vcpu_add(v.vm, 1, guest_code); - - v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, - "read GICR_TYPER before GIC initialized"); - - (void)vm_vcpu_add(v.vm, 2, guest_code); - - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - for (i = 0; i < NR_VCPUS ; i++) { - v3_redist_reg_get(v.gic_fd, i, GICR_TYPER, i * 0x100, - "read GICR_TYPER before rdist region setting"); - } - - addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - /* The 2 first rdists should be put there (vcpu 0 and 3) */ - v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x0, "read typer of rdist #0"); - v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #1"); - - addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); - ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); - - v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, - "no redist 
region attached to vcpu #1 yet, last cannot be returned"); - v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, - "no redist region attached to vcpu #2, last cannot be returned"); - - addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); - v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, - "read typer of rdist #1, last properly returned"); - - vm_gic_destroy(&v); -} - -static struct vm_gic vm_gic_v3_create_with_vcpuids(int nr_vcpus, - uint32_t vcpuids[]) -{ - struct vm_gic v; - int i; - - v.vm = vm_create(nr_vcpus); - for (i = 0; i < nr_vcpus; i++) - vm_vcpu_add(v.vm, vcpuids[i], guest_code); - - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); - - return v; -} - -/** - * Test GICR_TYPER last bit with new redist regions - * rdist regions #1 and #2 are contiguous - * rdist region #0 @0x100000 2 rdist capacity - * rdists: 0, 3 (Last) - * rdist region #1 @0x240000 2 rdist capacity - * rdists: 5, 4 (Last) - * rdist region #2 @0x200000 2 rdist capacity - * rdists: 1, 2 - */ -static void test_v3_last_bit_redist_regions(void) -{ - uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; - struct vm_gic v; - uint64_t addr; - - v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); - - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - - v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); - v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); - v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, "read typer of rdist #2"); - v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #3"); - v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #5"); - v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, 0x410, "read typer of rdist #4"); - - vm_gic_destroy(&v); -} - -/* Test last bit with legacy region */ -static void test_v3_last_bit_single_rdist(void) -{ - uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; - struct vm_gic v; - uint64_t addr; - - v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); - - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - addr = 0x10000; - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); - - v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); - v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x300, "read typer of rdist #1"); - v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #2"); - v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #3"); - v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, "read typer of rdist #3"); - - vm_gic_destroy(&v); -} - -/* Uses the legacy REDIST region API. 
*/ -static void test_v3_redist_ipa_range_check_at_vcpu_run(void) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - struct vm_gic v; - int ret, i; - uint64_t addr; - - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, 1, vcpus); - - /* Set space for 3 redists, we have 1 vcpu, so this succeeds. */ - addr = max_phys_size - (3 * 2 * 0x10000); - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); - - addr = 0x00000; - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); - - /* Add the rest of the VCPUs */ - for (i = 1; i < NR_VCPUS; ++i) - vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); - - kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - /* Attempt to run a vcpu without enough redist space. */ - ret = run_vcpu(vcpus[2]); - TEST_ASSERT(ret && errno == EINVAL, - "redist base+size above PA range detected on 1st vcpu run"); - - vm_gic_destroy(&v); -} - -static void test_v3_its_region(void) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - struct vm_gic v; - uint64_t addr; - int its_fd, ret; - - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); - its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); - - addr = 0x401000; - ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr); - TEST_ASSERT(ret && errno == EINVAL, - "ITS region with misaligned address"); - - addr = max_phys_size; - ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr); - TEST_ASSERT(ret && errno == E2BIG, - "register ITS region with base address beyond IPA range"); - - addr = max_phys_size - 0x10000; - ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr); - TEST_ASSERT(ret && errno == E2BIG, - "Half of ITS region is beyond IPA range"); - - /* This one succeeds setting the ITS base */ - addr = 0x400000; - kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr); - - addr = 0x300000; - ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr); - TEST_ASSERT(ret && errno == EEXIST, "ITS base set again"); - - close(its_fd); - vm_gic_destroy(&v); -} - -/* - * Returns 0 if it's possible to create GIC device of a given type (V2 or V3). - */ -int test_kvm_device(uint32_t gic_dev_type) -{ - struct kvm_vcpu *vcpus[NR_VCPUS]; - struct vm_gic v; - uint32_t other; - int ret; - - v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); - - /* try to create a non existing KVM device */ - ret = __kvm_test_create_device(v.vm, 0); - TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); - - /* trial mode */ - ret = __kvm_test_create_device(v.vm, gic_dev_type); - if (ret) - return ret; - v.gic_fd = kvm_create_device(v.vm, gic_dev_type); - - ret = __kvm_create_device(v.vm, gic_dev_type); - TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice"); - - /* try to create the other gic_dev_type */ - other = VGIC_DEV_IS_V2(gic_dev_type) ? 
KVM_DEV_TYPE_ARM_VGIC_V3 - : KVM_DEV_TYPE_ARM_VGIC_V2; - - if (!__kvm_test_create_device(v.vm, other)) { - ret = __kvm_create_device(v.vm, other); - TEST_ASSERT(ret < 0 && (errno == EINVAL || errno == EEXIST), - "create GIC device while other version exists"); - } - - vm_gic_destroy(&v); - - return 0; -} - -void run_tests(uint32_t gic_dev_type) -{ - test_vcpus_then_vgic(gic_dev_type); - test_vgic_then_vcpus(gic_dev_type); - - if (VGIC_DEV_IS_V2(gic_dev_type)) - test_v2_uaccess_cpuif_no_vcpus(); - - if (VGIC_DEV_IS_V3(gic_dev_type)) { - test_v3_new_redist_regions(); - test_v3_typer_accesses(); - test_v3_last_bit_redist_regions(); - test_v3_last_bit_single_rdist(); - test_v3_redist_ipa_range_check_at_vcpu_run(); - test_v3_its_region(); - } -} - -int main(int ac, char **av) -{ - int ret; - int pa_bits; - int cnt_impl = 0; - - pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; - max_phys_size = 1ULL << pa_bits; - - ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V3); - if (!ret) { - pr_info("Running GIC_v3 tests.\n"); - run_tests(KVM_DEV_TYPE_ARM_VGIC_V3); - cnt_impl++; - } - - ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (!ret) { - pr_info("Running GIC_v2 tests.\n"); - run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); - cnt_impl++; - } - - if (!cnt_impl) { - print_skip("No GICv2 nor GICv3 support"); - exit(KSFT_SKIP); - } - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c deleted file mode 100644 index f4ac28d53747..000000000000 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ /dev/null @@ -1,847 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * vgic_irq.c - Test userspace injection of IRQs - * - * This test validates the injection of IRQs from userspace using various - * methods (e.g., KVM_IRQ_LINE) and modes (e.g., EOI). The guest "asks" the - * host to inject a specific intid via a GUEST_SYNC call, and then checks that - * it received it. - */ -#include -#include -#include -#include - -#include "processor.h" -#include "test_util.h" -#include "kvm_util.h" -#include "gic.h" -#include "gic_v3.h" -#include "vgic.h" - -/* - * Stores the user specified args; it's passed to the guest and to every test - * function. - */ -struct test_args { - uint32_t nr_irqs; /* number of KVM supported IRQs. */ - bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */ - bool level_sensitive; /* 1 is level, 0 is edge */ - int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ - bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ -}; - -/* - * KVM implements 32 priority levels: - * 0x00 (highest priority) - 0xF8 (lowest priority), in steps of 8 - * - * Note that these macros will still be correct in the case that KVM implements - * more priority levels. Also note that 32 is the minimum for GICv3 and GICv2. - */ -#define KVM_NUM_PRIOS 32 -#define KVM_PRIO_SHIFT 3 /* steps of 8 = 1 << 3 */ -#define KVM_PRIO_STEPS (1 << KVM_PRIO_SHIFT) /* 8 */ -#define LOWEST_PRIO (KVM_NUM_PRIOS - 1) -#define CPU_PRIO_MASK (LOWEST_PRIO << KVM_PRIO_SHIFT) /* 0xf8 */ -#define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) -#define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ - -/* - * The kvm_inject_* utilities are used by the guest to ask the host to inject - * interrupts (e.g., using the KVM_IRQ_LINE ioctl). 
- */ - -typedef enum { - KVM_INJECT_EDGE_IRQ_LINE = 1, - KVM_SET_IRQ_LINE, - KVM_SET_IRQ_LINE_HIGH, - KVM_SET_LEVEL_INFO_HIGH, - KVM_INJECT_IRQFD, - KVM_WRITE_ISPENDR, - KVM_WRITE_ISACTIVER, -} kvm_inject_cmd; - -struct kvm_inject_args { - kvm_inject_cmd cmd; - uint32_t first_intid; - uint32_t num; - int level; - bool expect_failure; -}; - -/* Used on the guest side to perform the hypercall. */ -static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, - uint32_t num, int level, bool expect_failure); - -/* Used on the host side to get the hypercall info. */ -static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc, - struct kvm_inject_args *args); - -#define _KVM_INJECT_MULTI(cmd, intid, num, expect_failure) \ - kvm_inject_call(cmd, intid, num, -1 /* not used */, expect_failure) - -#define KVM_INJECT_MULTI(cmd, intid, num) \ - _KVM_INJECT_MULTI(cmd, intid, num, false) - -#define _KVM_INJECT(cmd, intid, expect_failure) \ - _KVM_INJECT_MULTI(cmd, intid, 1, expect_failure) - -#define KVM_INJECT(cmd, intid) \ - _KVM_INJECT_MULTI(cmd, intid, 1, false) - -#define KVM_ACTIVATE(cmd, intid) \ - kvm_inject_call(cmd, intid, 1, 1, false); - -struct kvm_inject_desc { - kvm_inject_cmd cmd; - /* can inject PPIs, PPIs, and/or SPIs. */ - bool sgi, ppi, spi; -}; - -static struct kvm_inject_desc inject_edge_fns[] = { - /* sgi ppi spi */ - { KVM_INJECT_EDGE_IRQ_LINE, false, false, true }, - { KVM_INJECT_IRQFD, false, false, true }, - { KVM_WRITE_ISPENDR, true, false, true }, - { 0, }, -}; - -static struct kvm_inject_desc inject_level_fns[] = { - /* sgi ppi spi */ - { KVM_SET_IRQ_LINE_HIGH, false, true, true }, - { KVM_SET_LEVEL_INFO_HIGH, false, true, true }, - { KVM_INJECT_IRQFD, false, false, true }, - { KVM_WRITE_ISPENDR, false, true, true }, - { 0, }, -}; - -static struct kvm_inject_desc set_active_fns[] = { - /* sgi ppi spi */ - { KVM_WRITE_ISACTIVER, true, true, true }, - { 0, }, -}; - -#define for_each_inject_fn(t, f) \ - for ((f) = (t); (f)->cmd; (f)++) - -#define for_each_supported_inject_fn(args, t, f) \ - for_each_inject_fn(t, f) \ - if ((args)->kvm_supports_irqfd || (f)->cmd != KVM_INJECT_IRQFD) - -#define for_each_supported_activate_fn(args, t, f) \ - for_each_supported_inject_fn((args), (t), (f)) - -/* Shared between the guest main thread and the IRQ handlers. 
*/ -volatile uint64_t irq_handled; -volatile uint32_t irqnr_received[MAX_SPI + 1]; - -static void reset_stats(void) -{ - int i; - - irq_handled = 0; - for (i = 0; i <= MAX_SPI; i++) - irqnr_received[i] = 0; -} - -static uint64_t gic_read_ap1r0(void) -{ - uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); - - dsb(sy); - return reg; -} - -static void gic_write_ap1r0(uint64_t val) -{ - write_sysreg_s(val, SYS_ICC_AP1R0_EL1); - isb(); -} - -static void guest_set_irq_line(uint32_t intid, uint32_t level); - -static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive) -{ - uint32_t intid = gic_get_and_ack_irq(); - - if (intid == IAR_SPURIOUS) - return; - - GUEST_ASSERT(gic_irq_get_active(intid)); - - if (!level_sensitive) - GUEST_ASSERT(!gic_irq_get_pending(intid)); - - if (level_sensitive) - guest_set_irq_line(intid, 0); - - GUEST_ASSERT(intid < MAX_SPI); - irqnr_received[intid] += 1; - irq_handled += 1; - - gic_set_eoi(intid); - GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); - if (eoi_split) - gic_set_dir(intid); - - GUEST_ASSERT(!gic_irq_get_active(intid)); - GUEST_ASSERT(!gic_irq_get_pending(intid)); -} - -static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, - uint32_t num, int level, bool expect_failure) -{ - struct kvm_inject_args args = { - .cmd = cmd, - .first_intid = first_intid, - .num = num, - .level = level, - .expect_failure = expect_failure, - }; - GUEST_SYNC(&args); -} - -#define GUEST_ASSERT_IAR_EMPTY() \ -do { \ - uint32_t _intid; \ - _intid = gic_get_and_ack_irq(); \ - GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ -} while (0) - -#define CAT_HELPER(a, b) a ## b -#define CAT(a, b) CAT_HELPER(a, b) -#define PREFIX guest_irq_handler_ -#define GUEST_IRQ_HANDLER_NAME(split, lev) CAT(PREFIX, CAT(split, lev)) -#define GENERATE_GUEST_IRQ_HANDLER(split, lev) \ -static void CAT(PREFIX, CAT(split, lev))(struct ex_regs *regs) \ -{ \ - guest_irq_generic_handler(split, lev); \ -} - -GENERATE_GUEST_IRQ_HANDLER(0, 0); -GENERATE_GUEST_IRQ_HANDLER(0, 1); -GENERATE_GUEST_IRQ_HANDLER(1, 0); -GENERATE_GUEST_IRQ_HANDLER(1, 1); - -static void (*guest_irq_handlers[2][2])(struct ex_regs *) = { - {GUEST_IRQ_HANDLER_NAME(0, 0), GUEST_IRQ_HANDLER_NAME(0, 1),}, - {GUEST_IRQ_HANDLER_NAME(1, 0), GUEST_IRQ_HANDLER_NAME(1, 1),}, -}; - -static void reset_priorities(struct test_args *args) -{ - int i; - - for (i = 0; i < args->nr_irqs; i++) - gic_set_priority(i, IRQ_DEFAULT_PRIO_REG); -} - -static void guest_set_irq_line(uint32_t intid, uint32_t level) -{ - kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level, false); -} - -static void test_inject_fail(struct test_args *args, - uint32_t intid, kvm_inject_cmd cmd) -{ - reset_stats(); - - _KVM_INJECT(cmd, intid, true); - /* no IRQ to handle on entry */ - - GUEST_ASSERT_EQ(irq_handled, 0); - GUEST_ASSERT_IAR_EMPTY(); -} - -static void guest_inject(struct test_args *args, - uint32_t first_intid, uint32_t num, - kvm_inject_cmd cmd) -{ - uint32_t i; - - reset_stats(); - - /* Cycle over all priorities to make things more interesting. 
*/ - for (i = first_intid; i < num + first_intid; i++) - gic_set_priority(i, (i % (KVM_NUM_PRIOS - 1)) << 3); - - asm volatile("msr daifset, #2" : : : "memory"); - KVM_INJECT_MULTI(cmd, first_intid, num); - - while (irq_handled < num) { - wfi(); - local_irq_enable(); - isb(); /* handle IRQ */ - local_irq_disable(); - } - local_irq_enable(); - - GUEST_ASSERT_EQ(irq_handled, num); - for (i = first_intid; i < num + first_intid; i++) - GUEST_ASSERT_EQ(irqnr_received[i], 1); - GUEST_ASSERT_IAR_EMPTY(); - - reset_priorities(args); -} - -/* - * Restore the active state of multiple concurrent IRQs (given by - * concurrent_irqs). This does what a live-migration would do on the - * destination side assuming there are some active IRQs that were not - * deactivated yet. - */ -static void guest_restore_active(struct test_args *args, - uint32_t first_intid, uint32_t num, - kvm_inject_cmd cmd) -{ - uint32_t prio, intid, ap1r; - int i; - - /* - * Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs - * in descending order, so intid+1 can preempt intid. - */ - for (i = 0, prio = (num - 1) * 8; i < num; i++, prio -= 8) { - GUEST_ASSERT(prio >= 0); - intid = i + first_intid; - gic_set_priority(intid, prio); - } - - /* - * In a real migration, KVM would restore all GIC state before running - * guest code. - */ - for (i = 0; i < num; i++) { - intid = i + first_intid; - KVM_ACTIVATE(cmd, intid); - ap1r = gic_read_ap1r0(); - ap1r |= 1U << i; - gic_write_ap1r0(ap1r); - } - - /* This is where the "migration" would occur. */ - - /* finish handling the IRQs starting with the highest priority one. */ - for (i = 0; i < num; i++) { - intid = num - i - 1 + first_intid; - gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); - } - - for (i = 0; i < num; i++) - GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); - GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); - GUEST_ASSERT_IAR_EMPTY(); -} - -/* - * Polls the IAR until it's not a spurious interrupt. - * - * This function should only be used in test_inject_preemption (with IRQs - * masked). - */ -static uint32_t wait_for_and_activate_irq(void) -{ - uint32_t intid; - - do { - asm volatile("wfi" : : : "memory"); - intid = gic_get_and_ack_irq(); - } while (intid == IAR_SPURIOUS); - - return intid; -} - -/* - * Inject multiple concurrent IRQs (num IRQs starting at first_intid) and - * handle them without handling the actual exceptions. This is done by masking - * interrupts for the whole test. - */ -static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) -{ - uint32_t intid, prio, step = KVM_PRIO_STEPS; - int i; - - /* Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs - * in descending order, so intid+1 can preempt intid. - */ - for (i = 0, prio = (num - 1) * step; i < num; i++, prio -= step) { - GUEST_ASSERT(prio >= 0); - intid = i + first_intid; - gic_set_priority(intid, prio); - } - - local_irq_disable(); - - for (i = 0; i < num; i++) { - uint32_t tmp; - intid = i + first_intid; - KVM_INJECT(cmd, intid); - /* Each successive IRQ will preempt the previous one. */ - tmp = wait_for_and_activate_irq(); - GUEST_ASSERT_EQ(tmp, intid); - if (args->level_sensitive) - guest_set_irq_line(intid, 0); - } - - /* finish handling the IRQs starting with the highest priority one. 
*/ - for (i = 0; i < num; i++) { - intid = num - i - 1 + first_intid; - gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); - } - - local_irq_enable(); - - for (i = 0; i < num; i++) - GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); - GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); - GUEST_ASSERT_IAR_EMPTY(); - - reset_priorities(args); -} - -static void test_injection(struct test_args *args, struct kvm_inject_desc *f) -{ - uint32_t nr_irqs = args->nr_irqs; - - if (f->sgi) { - guest_inject(args, MIN_SGI, 1, f->cmd); - guest_inject(args, 0, 16, f->cmd); - } - - if (f->ppi) - guest_inject(args, MIN_PPI, 1, f->cmd); - - if (f->spi) { - guest_inject(args, MIN_SPI, 1, f->cmd); - guest_inject(args, nr_irqs - 1, 1, f->cmd); - guest_inject(args, MIN_SPI, nr_irqs - MIN_SPI, f->cmd); - } -} - -static void test_injection_failure(struct test_args *args, - struct kvm_inject_desc *f) -{ - uint32_t bad_intid[] = { args->nr_irqs, 1020, 1024, 1120, 5120, ~0U, }; - int i; - - for (i = 0; i < ARRAY_SIZE(bad_intid); i++) - test_inject_fail(args, bad_intid[i], f->cmd); -} - -static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) -{ - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. The number of LRs implemented is - * IMPLEMENTATION DEFINED, however, it seems that most implement 4. - */ - if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); - - if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); - - if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); -} - -static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) -{ - /* Test up to 4 active IRQs. Same reason as in test_preemption. */ - if (f->sgi) - guest_restore_active(args, MIN_SGI, 4, f->cmd); - - if (f->ppi) - guest_restore_active(args, MIN_PPI, 4, f->cmd); - - if (f->spi) - guest_restore_active(args, MIN_SPI, 4, f->cmd); -} - -static void guest_code(struct test_args *args) -{ - uint32_t i, nr_irqs = args->nr_irqs; - bool level_sensitive = args->level_sensitive; - struct kvm_inject_desc *f, *inject_fns; - - gic_init(GIC_V3, 1); - - for (i = 0; i < nr_irqs; i++) - gic_irq_enable(i); - - for (i = MIN_SPI; i < nr_irqs; i++) - gic_irq_set_config(i, !level_sensitive); - - gic_set_eoi_split(args->eoi_split); - - reset_priorities(args); - gic_set_priority_mask(CPU_PRIO_MASK); - - inject_fns = level_sensitive ? inject_level_fns - : inject_edge_fns; - - local_irq_enable(); - - /* Start the tests. */ - for_each_supported_inject_fn(args, inject_fns, f) { - test_injection(args, f); - test_preemption(args, f); - test_injection_failure(args, f); - } - - /* - * Restore the active state of IRQs. This would happen when live - * migrating IRQs in the middle of being handled. - */ - for_each_supported_activate_fn(args, set_active_fns, f) - test_restore_active(args, f); - - GUEST_DONE(); -} - -static void kvm_irq_line_check(struct kvm_vm *vm, uint32_t intid, int level, - struct test_args *test_args, bool expect_failure) -{ - int ret; - - if (!expect_failure) { - kvm_arm_irq_line(vm, intid, level); - } else { - /* The interface doesn't allow larger intid's. 
*/ - if (intid > KVM_ARM_IRQ_NUM_MASK) - return; - - ret = _kvm_arm_irq_line(vm, intid, level); - TEST_ASSERT(ret != 0 && errno == EINVAL, - "Bad intid %i did not cause KVM_IRQ_LINE " - "error: rc: %i errno: %i", intid, ret, errno); - } -} - -void kvm_irq_set_level_info_check(int gic_fd, uint32_t intid, int level, - bool expect_failure) -{ - if (!expect_failure) { - kvm_irq_set_level_info(gic_fd, intid, level); - } else { - int ret = _kvm_irq_set_level_info(gic_fd, intid, level); - /* - * The kernel silently fails for invalid SPIs and SGIs (which - * are not level-sensitive). It only checks for intid to not - * spill over 1U << 10 (the max reserved SPI). Also, callers - * are supposed to mask the intid with 0x3ff (1023). - */ - if (intid > VGIC_MAX_RESERVED) - TEST_ASSERT(ret != 0 && errno == EINVAL, - "Bad intid %i did not cause VGIC_GRP_LEVEL_INFO " - "error: rc: %i errno: %i", intid, ret, errno); - else - TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO " - "for intid %i failed, rc: %i errno: %i", - intid, ret, errno); - } -} - -static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm, - uint32_t intid, uint32_t num, uint32_t kvm_max_routes, - bool expect_failure) -{ - struct kvm_irq_routing *routing; - int ret; - uint64_t i; - - assert(num <= kvm_max_routes && kvm_max_routes <= KVM_MAX_IRQ_ROUTES); - - routing = kvm_gsi_routing_create(); - for (i = intid; i < (uint64_t)intid + num; i++) - kvm_gsi_routing_irqchip_add(routing, i - MIN_SPI, i - MIN_SPI); - - if (!expect_failure) { - kvm_gsi_routing_write(vm, routing); - } else { - ret = _kvm_gsi_routing_write(vm, routing); - /* The kernel only checks e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS */ - if (((uint64_t)intid + num - 1 - MIN_SPI) >= KVM_IRQCHIP_NUM_PINS) - TEST_ASSERT(ret != 0 && errno == EINVAL, - "Bad intid %u did not cause KVM_SET_GSI_ROUTING " - "error: rc: %i errno: %i", intid, ret, errno); - else - TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING " - "for intid %i failed, rc: %i errno: %i", - intid, ret, errno); - } -} - -static void kvm_irq_write_ispendr_check(int gic_fd, uint32_t intid, - struct kvm_vcpu *vcpu, - bool expect_failure) -{ - /* - * Ignore this when expecting failure as invalid intids will lead to - * either trying to inject SGIs when we configured the test to be - * level_sensitive (or the reverse), or inject large intids which - * will lead to writing above the ISPENDR register space (and we - * don't want to do that either). - */ - if (!expect_failure) - kvm_irq_write_ispendr(gic_fd, intid, vcpu); -} - -static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, - uint32_t intid, uint32_t num, uint32_t kvm_max_routes, - bool expect_failure) -{ - int fd[MAX_SPI]; - uint64_t val; - int ret, f; - uint64_t i; - - /* - * There is no way to try injecting an SGI or PPI as the interface - * starts counting from the first SPI (above the private ones), so just - * exit. - */ - if (INTID_IS_SGI(intid) || INTID_IS_PPI(intid)) - return; - - kvm_set_gsi_routing_irqchip_check(vm, intid, num, - kvm_max_routes, expect_failure); - - /* - * If expect_failure, then just to inject anyway. These - * will silently fail. And in any case, the guest will check - * that no actual interrupt was injected for those cases. 
- */ - - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { - fd[f] = eventfd(0, 0); - TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); - } - - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { - struct kvm_irqfd irqfd = { - .fd = fd[f], - .gsi = i - MIN_SPI, - }; - assert(i <= (uint64_t)UINT_MAX); - vm_ioctl(vm, KVM_IRQFD, &irqfd); - } - - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { - val = 1; - ret = write(fd[f], &val, sizeof(uint64_t)); - TEST_ASSERT(ret == sizeof(uint64_t), - __KVM_SYSCALL_ERROR("write()", ret)); - } - - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); -} - -/* handles the valid case: intid=0xffffffff num=1 */ -#define for_each_intid(first, num, tmp, i) \ - for ((tmp) = (i) = (first); \ - (tmp) < (uint64_t)(first) + (uint64_t)(num); \ - (tmp)++, (i)++) - -static void run_guest_cmd(struct kvm_vcpu *vcpu, int gic_fd, - struct kvm_inject_args *inject_args, - struct test_args *test_args) -{ - kvm_inject_cmd cmd = inject_args->cmd; - uint32_t intid = inject_args->first_intid; - uint32_t num = inject_args->num; - int level = inject_args->level; - bool expect_failure = inject_args->expect_failure; - struct kvm_vm *vm = vcpu->vm; - uint64_t tmp; - uint32_t i; - - /* handles the valid case: intid=0xffffffff num=1 */ - assert(intid < UINT_MAX - num || num == 1); - - switch (cmd) { - case KVM_INJECT_EDGE_IRQ_LINE: - for_each_intid(intid, num, tmp, i) - kvm_irq_line_check(vm, i, 1, test_args, - expect_failure); - for_each_intid(intid, num, tmp, i) - kvm_irq_line_check(vm, i, 0, test_args, - expect_failure); - break; - case KVM_SET_IRQ_LINE: - for_each_intid(intid, num, tmp, i) - kvm_irq_line_check(vm, i, level, test_args, - expect_failure); - break; - case KVM_SET_IRQ_LINE_HIGH: - for_each_intid(intid, num, tmp, i) - kvm_irq_line_check(vm, i, 1, test_args, - expect_failure); - break; - case KVM_SET_LEVEL_INFO_HIGH: - for_each_intid(intid, num, tmp, i) - kvm_irq_set_level_info_check(gic_fd, i, 1, - expect_failure); - break; - case KVM_INJECT_IRQFD: - kvm_routing_and_irqfd_check(vm, intid, num, - test_args->kvm_max_routes, - expect_failure); - break; - case KVM_WRITE_ISPENDR: - for (i = intid; i < intid + num; i++) - kvm_irq_write_ispendr_check(gic_fd, i, vcpu, - expect_failure); - break; - case KVM_WRITE_ISACTIVER: - for (i = intid; i < intid + num; i++) - kvm_irq_write_isactiver(gic_fd, i, vcpu); - break; - default: - break; - } -} - -static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc, - struct kvm_inject_args *args) -{ - struct kvm_inject_args *kvm_args_hva; - vm_vaddr_t kvm_args_gva; - - kvm_args_gva = uc->args[1]; - kvm_args_hva = (struct kvm_inject_args *)addr_gva2hva(vm, kvm_args_gva); - memcpy(args, kvm_args_hva, sizeof(struct kvm_inject_args)); -} - -static void print_args(struct test_args *args) -{ - printf("nr-irqs=%d level-sensitive=%d eoi-split=%d\n", - args->nr_irqs, args->level_sensitive, - args->eoi_split); -} - -static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) -{ - struct ucall uc; - int gic_fd; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_inject_args inject_args; - vm_vaddr_t args_gva; - - struct test_args args = { - .nr_irqs = nr_irqs, - .level_sensitive = level_sensitive, - .eoi_split = eoi_split, - .kvm_max_routes = kvm_check_cap(KVM_CAP_IRQ_ROUTING), - .kvm_supports_irqfd = kvm_check_cap(KVM_CAP_IRQFD), - }; - - print_args(&args); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vm_init_descriptor_tables(vm); 
- vcpu_init_descriptor_tables(vcpu); - - /* Setup the guest args page (so it gets the args). */ - args_gva = vm_vaddr_alloc_page(vm); - memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); - vcpu_args_set(vcpu, 1, args_gva); - - gic_fd = vgic_v3_setup(vm, 1, nr_irqs); - __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); - - vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, - guest_irq_handlers[args.eoi_split][args.level_sensitive]); - - while (1) { - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - kvm_inject_get_call(vm, &uc, &inject_args); - run_guest_cmd(vcpu, gic_fd, &inject_args, &args); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - close(gic_fd); - kvm_vm_free(vm); -} - -static void help(const char *name) -{ - printf( - "\n" - "usage: %s [-n num_irqs] [-e eoi_split] [-l level_sensitive]\n", name); - printf(" -n: specify number of IRQs to setup the vgic with. " - "It has to be a multiple of 32 and between 64 and 1024.\n"); - printf(" -e: if 1 then EOI is split into a write to DIR on top " - "of writing EOI.\n"); - printf(" -l: specify whether the IRQs are level-sensitive (1) or not (0)."); - puts(""); - exit(1); -} - -int main(int argc, char **argv) -{ - uint32_t nr_irqs = 64; - bool default_args = true; - bool level_sensitive = false; - int opt; - bool eoi_split = false; - - while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { - switch (opt) { - case 'n': - nr_irqs = atoi_non_negative("Number of IRQs", optarg); - if (nr_irqs > 1024 || nr_irqs % 32) - help(argv[0]); - break; - case 'e': - eoi_split = (bool)atoi_paranoid(optarg); - default_args = false; - break; - case 'l': - level_sensitive = (bool)atoi_paranoid(optarg); - default_args = false; - break; - case 'h': - default: - help(argv[0]); - break; - } - } - - /* - * If the user just specified nr_irqs and/or gic_version, then run all - * combinations. 
- */ - if (default_args) { - test_vgic(nr_irqs, false /* level */, false /* eoi_split */); - test_vgic(nr_irqs, false /* level */, true /* eoi_split */); - test_vgic(nr_irqs, true /* level */, false /* eoi_split */); - test_vgic(nr_irqs, true /* level */, true /* eoi_split */); - } else { - test_vgic(nr_irqs, level_sensitive, eoi_split); - } - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c deleted file mode 100644 index fc4fe52fb6f8..000000000000 --- a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c +++ /dev/null @@ -1,410 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * vgic_lpi_stress - Stress test for KVM's ITS emulation - * - * Copyright (c) 2024 Google LLC - */ - -#include -#include -#include -#include - -#include "kvm_util.h" -#include "gic.h" -#include "gic_v3.h" -#include "gic_v3_its.h" -#include "processor.h" -#include "ucall.h" -#include "vgic.h" - -#define TEST_MEMSLOT_INDEX 1 - -#define GIC_LPI_OFFSET 8192 - -static size_t nr_iterations = 1000; -static vm_paddr_t gpa_base; - -static struct kvm_vm *vm; -static struct kvm_vcpu **vcpus; -static int gic_fd, its_fd; - -static struct test_data { - bool request_vcpus_stop; - u32 nr_cpus; - u32 nr_devices; - u32 nr_event_ids; - - vm_paddr_t device_table; - vm_paddr_t collection_table; - vm_paddr_t cmdq_base; - void *cmdq_base_va; - vm_paddr_t itt_tables; - - vm_paddr_t lpi_prop_table; - vm_paddr_t lpi_pend_tables; -} test_data = { - .nr_cpus = 1, - .nr_devices = 1, - .nr_event_ids = 16, -}; - -static void guest_irq_handler(struct ex_regs *regs) -{ - u32 intid = gic_get_and_ack_irq(); - - if (intid == IAR_SPURIOUS) - return; - - GUEST_ASSERT(intid >= GIC_LPI_OFFSET); - gic_set_eoi(intid); -} - -static void guest_setup_its_mappings(void) -{ - u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; - u32 nr_events = test_data.nr_event_ids; - u32 nr_devices = test_data.nr_devices; - u32 nr_cpus = test_data.nr_cpus; - - for (coll_id = 0; coll_id < nr_cpus; coll_id++) - its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); - - /* Round-robin the LPIs to all of the vCPUs in the VM */ - coll_id = 0; - for (device_id = 0; device_id < nr_devices; device_id++) { - vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); - - its_send_mapd_cmd(test_data.cmdq_base_va, device_id, - itt_base, SZ_64K, true); - - for (event_id = 0; event_id < nr_events; event_id++) { - its_send_mapti_cmd(test_data.cmdq_base_va, device_id, - event_id, coll_id, intid++); - - coll_id = (coll_id + 1) % test_data.nr_cpus; - } - } -} - -static void guest_invalidate_all_rdists(void) -{ - int i; - - for (i = 0; i < test_data.nr_cpus; i++) - its_send_invall_cmd(test_data.cmdq_base_va, i); -} - -static void guest_setup_gic(void) -{ - static atomic_int nr_cpus_ready = 0; - u32 cpuid = guest_get_vcpuid(); - - gic_init(GIC_V3, test_data.nr_cpus); - gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, - test_data.lpi_pend_tables + (cpuid * SZ_64K)); - - atomic_fetch_add(&nr_cpus_ready, 1); - - if (cpuid > 0) - return; - - while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) - cpu_relax(); - - its_init(test_data.collection_table, SZ_64K, - test_data.device_table, SZ_64K, - test_data.cmdq_base, SZ_64K); - - guest_setup_its_mappings(); - guest_invalidate_all_rdists(); -} - -static void guest_code(size_t nr_lpis) -{ - guest_setup_gic(); - - GUEST_SYNC(0); - - /* - * Don't use WFI here to avoid blocking the vCPU thread indefinitely and - * never getting the 
stop signal. - */ - while (!READ_ONCE(test_data.request_vcpus_stop)) - cpu_relax(); - - GUEST_DONE(); -} - -static void setup_memslot(void) -{ - size_t pages; - size_t sz; - - /* - * For the ITS: - * - A single level device table - * - A single level collection table - * - The command queue - * - An ITT for each device - */ - sz = (3 + test_data.nr_devices) * SZ_64K; - - /* - * For the redistributors: - * - A shared LPI configuration table - * - An LPI pending table for each vCPU - */ - sz += (1 + test_data.nr_cpus) * SZ_64K; - - pages = sz / vm->page_size; - gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, - TEST_MEMSLOT_INDEX, pages, 0); -} - -#define LPI_PROP_DEFAULT_PRIO 0xa0 - -static void configure_lpis(void) -{ - size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; - u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); - size_t i; - - for (i = 0; i < nr_lpis; i++) { - tbl[i] = LPI_PROP_DEFAULT_PRIO | - LPI_PROP_GROUP1 | - LPI_PROP_ENABLED; - } -} - -static void setup_test_data(void) -{ - size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); - u32 nr_devices = test_data.nr_devices; - u32 nr_cpus = test_data.nr_cpus; - vm_paddr_t cmdq_base; - - test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, - gpa_base, - TEST_MEMSLOT_INDEX); - - test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, - gpa_base, - TEST_MEMSLOT_INDEX); - - cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, - TEST_MEMSLOT_INDEX); - virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); - test_data.cmdq_base = cmdq_base; - test_data.cmdq_base_va = (void *)cmdq_base; - - test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, - gpa_base, TEST_MEMSLOT_INDEX); - - test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, - gpa_base, TEST_MEMSLOT_INDEX); - configure_lpis(); - - test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, - gpa_base, TEST_MEMSLOT_INDEX); - - sync_global_to_guest(vm, test_data); -} - -static void setup_gic(void) -{ - gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); - __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); - - its_fd = vgic_its_setup(vm); -} - -static void signal_lpi(u32 device_id, u32 event_id) -{ - vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; - - struct kvm_msi msi = { - .address_lo = db_addr, - .address_hi = db_addr >> 32, - .data = event_id, - .devid = device_id, - .flags = KVM_MSI_VALID_DEVID, - }; - - /* - * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, - * which for arm64 implies having a valid translation in the ITS. 
- */ - TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, - "KVM_SIGNAL_MSI ioctl failed"); -} - -static pthread_barrier_t test_setup_barrier; - -static void *lpi_worker_thread(void *data) -{ - u32 device_id = (size_t)data; - u32 event_id; - size_t i; - - pthread_barrier_wait(&test_setup_barrier); - - for (i = 0; i < nr_iterations; i++) - for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) - signal_lpi(device_id, event_id); - - return NULL; -} - -static void *vcpu_worker_thread(void *data) -{ - struct kvm_vcpu *vcpu = data; - struct ucall uc; - - while (true) { - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - pthread_barrier_wait(&test_setup_barrier); - continue; - case UCALL_DONE: - return NULL; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - default: - TEST_FAIL("Unknown ucall: %lu", uc.cmd); - } - } - - return NULL; -} - -static void report_stats(struct timespec delta) -{ - double nr_lpis; - double time; - - nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; - - time = delta.tv_sec; - time += ((double)delta.tv_nsec) / NSEC_PER_SEC; - - pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); -} - -static void run_test(void) -{ - u32 nr_devices = test_data.nr_devices; - u32 nr_vcpus = test_data.nr_cpus; - pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); - pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); - struct timespec start, delta; - size_t i; - - TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); - - pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); - - for (i = 0; i < nr_vcpus; i++) - pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); - - for (i = 0; i < nr_devices; i++) - pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); - - pthread_barrier_wait(&test_setup_barrier); - - clock_gettime(CLOCK_MONOTONIC, &start); - - for (i = 0; i < nr_devices; i++) - pthread_join(lpi_threads[i], NULL); - - delta = timespec_elapsed(start); - write_guest_global(vm, test_data.request_vcpus_stop, true); - - for (i = 0; i < nr_vcpus; i++) - pthread_join(vcpu_threads[i], NULL); - - report_stats(delta); -} - -static void setup_vm(void) -{ - int i; - - vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); - TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); - - vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); - - vm_init_descriptor_tables(vm); - for (i = 0; i < test_data.nr_cpus; i++) - vcpu_init_descriptor_tables(vcpus[i]); - - vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); - - setup_memslot(); - - setup_gic(); - - setup_test_data(); -} - -static void destroy_vm(void) -{ - close(its_fd); - close(gic_fd); - kvm_vm_free(vm); - free(vcpus); -} - -static void pr_usage(const char *name) -{ - pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); - pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); - pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); - pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); - pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); -} - -int main(int argc, char **argv) -{ - u32 nr_threads; - int c; - - while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { - switch (c) { - case 'v': - test_data.nr_cpus = atoi(optarg); - break; - case 'd': - test_data.nr_devices = atoi(optarg); - break; - case 'e': - test_data.nr_event_ids = 
atoi(optarg); - break; - case 'i': - nr_iterations = strtoul(optarg, NULL, 0); - break; - case 'h': - default: - pr_usage(argv[0]); - return 1; - } - } - - nr_threads = test_data.nr_cpus + test_data.nr_devices; - if (nr_threads > get_nprocs()) - pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", - nr_threads, get_nprocs()); - - setup_vm(); - - run_test(); - - destroy_vm(); - - return 0; -} diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c deleted file mode 100644 index f16b3b27e32e..000000000000 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ /dev/null @@ -1,648 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vpmu_counter_access - Test vPMU event counter access - * - * Copyright (c) 2023 Google LLC. - * - * This test checks if the guest can see the same number of the PMU event - * counters (PMCR_EL0.N) that userspace sets, if the guest can access - * those counters, and if the guest is prevented from accessing any - * other counters. - * It also checks if the userspace accesses to the PMU regsisters honor the - * PMCR.N value that's set for the guest. - * This test runs only when KVM_CAP_ARM_PMU_V3 is supported on the host. - */ -#include -#include -#include -#include -#include -#include - -/* The max number of the PMU event counters (excluding the cycle counter) */ -#define ARMV8_PMU_MAX_GENERAL_COUNTERS (ARMV8_PMU_MAX_COUNTERS - 1) - -/* The cycle counter bit position that's common among the PMU registers */ -#define ARMV8_PMU_CYCLE_IDX 31 - -struct vpmu_vm { - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - int gic_fd; -}; - -static struct vpmu_vm vpmu_vm; - -struct pmreg_sets { - uint64_t set_reg_id; - uint64_t clr_reg_id; -}; - -#define PMREG_SET(set, clr) {.set_reg_id = set, .clr_reg_id = clr} - -static uint64_t get_pmcr_n(uint64_t pmcr) -{ - return FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); -} - -static void set_pmcr_n(uint64_t *pmcr, uint64_t pmcr_n) -{ - u64p_replace_bits((__u64 *) pmcr, pmcr_n, ARMV8_PMU_PMCR_N); -} - -static uint64_t get_counters_mask(uint64_t n) -{ - uint64_t mask = BIT(ARMV8_PMU_CYCLE_IDX); - - if (n) - mask |= GENMASK(n - 1, 0); - return mask; -} - -/* Read PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ -static inline unsigned long read_sel_evcntr(int sel) -{ - write_sysreg(sel, pmselr_el0); - isb(); - return read_sysreg(pmxevcntr_el0); -} - -/* Write PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ -static inline void write_sel_evcntr(int sel, unsigned long val) -{ - write_sysreg(sel, pmselr_el0); - isb(); - write_sysreg(val, pmxevcntr_el0); - isb(); -} - -/* Read PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ -static inline unsigned long read_sel_evtyper(int sel) -{ - write_sysreg(sel, pmselr_el0); - isb(); - return read_sysreg(pmxevtyper_el0); -} - -/* Write PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ -static inline void write_sel_evtyper(int sel, unsigned long val) -{ - write_sysreg(sel, pmselr_el0); - isb(); - write_sysreg(val, pmxevtyper_el0); - isb(); -} - -static void pmu_disable_reset(void) -{ - uint64_t pmcr = read_sysreg(pmcr_el0); - - /* Reset all counters, disabling them */ - pmcr &= ~ARMV8_PMU_PMCR_E; - write_sysreg(pmcr | ARMV8_PMU_PMCR_P, pmcr_el0); - isb(); -} - -#define RETURN_READ_PMEVCNTRN(n) \ - return read_sysreg(pmevcntr##n##_el0) -static unsigned long read_pmevcntrn(int n) -{ - PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); - return 0; -} - -#define WRITE_PMEVCNTRN(n) \ - write_sysreg(val, pmevcntr##n##_el0) -static void write_pmevcntrn(int n, 
unsigned long val) -{ - PMEVN_SWITCH(n, WRITE_PMEVCNTRN); - isb(); -} - -#define READ_PMEVTYPERN(n) \ - return read_sysreg(pmevtyper##n##_el0) -static unsigned long read_pmevtypern(int n) -{ - PMEVN_SWITCH(n, READ_PMEVTYPERN); - return 0; -} - -#define WRITE_PMEVTYPERN(n) \ - write_sysreg(val, pmevtyper##n##_el0) -static void write_pmevtypern(int n, unsigned long val) -{ - PMEVN_SWITCH(n, WRITE_PMEVTYPERN); - isb(); -} - -/* - * The pmc_accessor structure has pointers to PMEV{CNTR,TYPER}_EL0 - * accessors that test cases will use. Each of the accessors will - * either directly reads/writes PMEV{CNTR,TYPER}_EL0 - * (i.e. {read,write}_pmev{cnt,type}rn()), or reads/writes them through - * PMXEV{CNTR,TYPER}_EL0 (i.e. {read,write}_sel_ev{cnt,type}r()). - * - * This is used to test that combinations of those accessors provide - * the consistent behavior. - */ -struct pmc_accessor { - /* A function to be used to read PMEVTCNTR_EL0 */ - unsigned long (*read_cntr)(int idx); - /* A function to be used to write PMEVTCNTR_EL0 */ - void (*write_cntr)(int idx, unsigned long val); - /* A function to be used to read PMEVTYPER_EL0 */ - unsigned long (*read_typer)(int idx); - /* A function to be used to write PMEVTYPER_EL0 */ - void (*write_typer)(int idx, unsigned long val); -}; - -struct pmc_accessor pmc_accessors[] = { - /* test with all direct accesses */ - { read_pmevcntrn, write_pmevcntrn, read_pmevtypern, write_pmevtypern }, - /* test with all indirect accesses */ - { read_sel_evcntr, write_sel_evcntr, read_sel_evtyper, write_sel_evtyper }, - /* read with direct accesses, and write with indirect accesses */ - { read_pmevcntrn, write_sel_evcntr, read_pmevtypern, write_sel_evtyper }, - /* read with indirect accesses, and write with direct accesses */ - { read_sel_evcntr, write_pmevcntrn, read_sel_evtyper, write_pmevtypern }, -}; - -/* - * Convert a pointer of pmc_accessor to an index in pmc_accessors[], - * assuming that the pointer is one of the entries in pmc_accessors[]. - */ -#define PMC_ACC_TO_IDX(acc) (acc - &pmc_accessors[0]) - -#define GUEST_ASSERT_BITMAP_REG(regname, mask, set_expected) \ -{ \ - uint64_t _tval = read_sysreg(regname); \ - \ - if (set_expected) \ - __GUEST_ASSERT((_tval & mask), \ - "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ - _tval, mask, set_expected); \ - else \ - __GUEST_ASSERT(!(_tval & mask), \ - "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ - _tval, mask, set_expected); \ -} - -/* - * Check if @mask bits in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers - * are set or cleared as specified in @set_expected. - */ -static void check_bitmap_pmu_regs(uint64_t mask, bool set_expected) -{ - GUEST_ASSERT_BITMAP_REG(pmcntenset_el0, mask, set_expected); - GUEST_ASSERT_BITMAP_REG(pmcntenclr_el0, mask, set_expected); - GUEST_ASSERT_BITMAP_REG(pmintenset_el1, mask, set_expected); - GUEST_ASSERT_BITMAP_REG(pmintenclr_el1, mask, set_expected); - GUEST_ASSERT_BITMAP_REG(pmovsset_el0, mask, set_expected); - GUEST_ASSERT_BITMAP_REG(pmovsclr_el0, mask, set_expected); -} - -/* - * Check if the bit in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers corresponding - * to the specified counter (@pmc_idx) can be read/written as expected. - * When @set_op is true, it tries to set the bit for the counter in - * those registers by writing the SET registers (the bit won't be set - * if the counter is not implemented though). - * Otherwise, it tries to clear the bits in the registers by writing - * the CLR registers. - * Then, it checks if the values indicated in the registers are as expected. 
- */ -static void test_bitmap_pmu_regs(int pmc_idx, bool set_op) -{ - uint64_t pmcr_n, test_bit = BIT(pmc_idx); - bool set_expected = false; - - if (set_op) { - write_sysreg(test_bit, pmcntenset_el0); - write_sysreg(test_bit, pmintenset_el1); - write_sysreg(test_bit, pmovsset_el0); - - /* The bit will be set only if the counter is implemented */ - pmcr_n = get_pmcr_n(read_sysreg(pmcr_el0)); - set_expected = (pmc_idx < pmcr_n) ? true : false; - } else { - write_sysreg(test_bit, pmcntenclr_el0); - write_sysreg(test_bit, pmintenclr_el1); - write_sysreg(test_bit, pmovsclr_el0); - } - check_bitmap_pmu_regs(test_bit, set_expected); -} - -/* - * Tests for reading/writing registers for the (implemented) event counter - * specified by @pmc_idx. - */ -static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx) -{ - uint64_t write_data, read_data; - - /* Disable all PMCs and reset all PMCs to zero. */ - pmu_disable_reset(); - - /* - * Tests for reading/writing {PMCNTEN,PMINTEN,PMOVS}{SET,CLR}_EL1. - */ - - /* Make sure that the bit in those registers are set to 0 */ - test_bitmap_pmu_regs(pmc_idx, false); - /* Test if setting the bit in those registers works */ - test_bitmap_pmu_regs(pmc_idx, true); - /* Test if clearing the bit in those registers works */ - test_bitmap_pmu_regs(pmc_idx, false); - - /* - * Tests for reading/writing the event type register. - */ - - /* - * Set the event type register to an arbitrary value just for testing - * of reading/writing the register. - * Arm ARM says that for the event from 0x0000 to 0x003F, - * the value indicated in the PMEVTYPER_EL0.evtCount field is - * the value written to the field even when the specified event - * is not supported. - */ - write_data = (ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMUV3_PERFCTR_INST_RETIRED); - acc->write_typer(pmc_idx, write_data); - read_data = acc->read_typer(pmc_idx); - __GUEST_ASSERT(read_data == write_data, - "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", - pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); - - /* - * Tests for reading/writing the event count register. - */ - - read_data = acc->read_cntr(pmc_idx); - - /* The count value must be 0, as it is disabled and reset */ - __GUEST_ASSERT(read_data == 0, - "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx", - pmc_idx, PMC_ACC_TO_IDX(acc), read_data); - - write_data = read_data + pmc_idx + 0x12345; - acc->write_cntr(pmc_idx, write_data); - read_data = acc->read_cntr(pmc_idx); - __GUEST_ASSERT(read_data == write_data, - "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", - pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); -} - -#define INVALID_EC (-1ul) -uint64_t expected_ec = INVALID_EC; - -static void guest_sync_handler(struct ex_regs *regs) -{ - uint64_t esr, ec; - - esr = read_sysreg(esr_el1); - ec = ESR_ELx_EC(esr); - - __GUEST_ASSERT(expected_ec == ec, - "PC: 0x%lx; ESR: 0x%lx; EC: 0x%lx; EC expected: 0x%lx", - regs->pc, esr, ec, expected_ec); - - /* skip the trapping instruction */ - regs->pc += 4; - - /* Use INVALID_EC to indicate an exception occurred */ - expected_ec = INVALID_EC; -} - -/* - * Run the given operation that should trigger an exception with the - * given exception class. The exception handler (guest_sync_handler) - * will reset op_end_addr to 0, expected_ec to INVALID_EC, and skip - * the instruction that trapped. 
- */ -#define TEST_EXCEPTION(ec, ops) \ -({ \ - GUEST_ASSERT(ec != INVALID_EC); \ - WRITE_ONCE(expected_ec, ec); \ - dsb(ish); \ - ops; \ - GUEST_ASSERT(expected_ec == INVALID_EC); \ -}) - -/* - * Tests for reading/writing registers for the unimplemented event counter - * specified by @pmc_idx (>= PMCR_EL0.N). - */ -static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx) -{ - /* - * Reading/writing the event count/type registers should cause - * an UNDEFINED exception. - */ - TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx)); - TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0)); - TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx)); - TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0)); - /* - * The bit corresponding to the (unimplemented) counter in - * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ. - */ - test_bitmap_pmu_regs(pmc_idx, 1); - test_bitmap_pmu_regs(pmc_idx, 0); -} - -/* - * The guest is configured with PMUv3 with @expected_pmcr_n number of - * event counters. - * Check if @expected_pmcr_n is consistent with PMCR_EL0.N, and - * if reading/writing PMU registers for implemented or unimplemented - * counters works as expected. - */ -static void guest_code(uint64_t expected_pmcr_n) -{ - uint64_t pmcr, pmcr_n, unimp_mask; - int i, pmc; - - __GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS, - "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x", - expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS); - - pmcr = read_sysreg(pmcr_el0); - pmcr_n = get_pmcr_n(pmcr); - - /* Make sure that PMCR_EL0.N indicates the value userspace set */ - __GUEST_ASSERT(pmcr_n == expected_pmcr_n, - "Expected PMCR.N: 0x%lx, PMCR.N: 0x%lx", - expected_pmcr_n, pmcr_n); - - /* - * Make sure that (RAZ) bits corresponding to unimplemented event - * counters in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers are reset - * to zero. - * (NOTE: bits for implemented event counters are reset to UNKNOWN) - */ - unimp_mask = GENMASK_ULL(ARMV8_PMU_MAX_GENERAL_COUNTERS - 1, pmcr_n); - check_bitmap_pmu_regs(unimp_mask, false); - - /* - * Tests for reading/writing PMU registers for implemented counters. - * Use each combination of PMEV{CNTR,TYPER}_EL0 accessor functions. - */ - for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { - for (pmc = 0; pmc < pmcr_n; pmc++) - test_access_pmc_regs(&pmc_accessors[i], pmc); - } - - /* - * Tests for reading/writing PMU registers for unimplemented counters. - * Use each combination of PMEV{CNTR,TYPER}_EL0 accessor functions. - */ - for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { - for (pmc = pmcr_n; pmc < ARMV8_PMU_MAX_GENERAL_COUNTERS; pmc++) - test_access_invalid_pmc_regs(&pmc_accessors[i], pmc); - } - - GUEST_DONE(); -} - -/* Create a VM that has one vCPU with PMUv3 configured. */ -static void create_vpmu_vm(void *guest_code) -{ - struct kvm_vcpu_init init; - uint8_t pmuver, ec; - uint64_t dfr0, irq = 23; - struct kvm_device_attr irq_attr = { - .group = KVM_ARM_VCPU_PMU_V3_CTRL, - .attr = KVM_ARM_VCPU_PMU_V3_IRQ, - .addr = (uint64_t)&irq, - }; - struct kvm_device_attr init_attr = { - .group = KVM_ARM_VCPU_PMU_V3_CTRL, - .attr = KVM_ARM_VCPU_PMU_V3_INIT, - }; - - /* The test creates the vpmu_vm multiple times. 
Ensure a clean state */ - memset(&vpmu_vm, 0, sizeof(vpmu_vm)); - - vpmu_vm.vm = vm_create(1); - vm_init_descriptor_tables(vpmu_vm.vm); - for (ec = 0; ec < ESR_ELx_EC_MAX + 1; ec++) { - vm_install_sync_handler(vpmu_vm.vm, VECTOR_SYNC_CURRENT, ec, - guest_sync_handler); - } - - /* Create vCPU with PMUv3 */ - vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); - init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); - vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); - vcpu_init_descriptor_tables(vpmu_vm.vcpu); - vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); - __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, - "Failed to create vgic-v3, skipping"); - - /* Make sure that PMUv3 support is indicated in the ID register */ - dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); - TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && - pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, - "Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver); - - /* Initialize vPMU */ - vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &irq_attr); - vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &init_attr); -} - -static void destroy_vpmu_vm(void) -{ - close(vpmu_vm.gic_fd); - kvm_vm_free(vpmu_vm.vm); -} - -static void run_vcpu(struct kvm_vcpu *vcpu, uint64_t pmcr_n) -{ - struct ucall uc; - - vcpu_args_set(vcpu, 1, pmcr_n); - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - break; - } -} - -static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) -{ - struct kvm_vcpu *vcpu; - uint64_t pmcr, pmcr_orig; - - create_vpmu_vm(guest_code); - vcpu = vpmu_vm.vcpu; - - pmcr_orig = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); - pmcr = pmcr_orig; - - /* - * Setting a larger value of PMCR.N should not modify the field, and - * return a success. - */ - set_pmcr_n(&pmcr, pmcr_n); - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), pmcr); - pmcr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); - - if (expect_fail) - TEST_ASSERT(pmcr_orig == pmcr, - "PMCR.N modified by KVM to a larger value (PMCR: 0x%lx) for pmcr_n: 0x%lx", - pmcr, pmcr_n); - else - TEST_ASSERT(pmcr_n == get_pmcr_n(pmcr), - "Failed to update PMCR.N to %lu (received: %lu)", - pmcr_n, get_pmcr_n(pmcr)); -} - -/* - * Create a guest with one vCPU, set the PMCR_EL0.N for the vCPU to @pmcr_n, - * and run the test. - */ -static void run_access_test(uint64_t pmcr_n) -{ - uint64_t sp; - struct kvm_vcpu *vcpu; - struct kvm_vcpu_init init; - - pr_debug("Test with pmcr_n %lu\n", pmcr_n); - - test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); - vcpu = vpmu_vm.vcpu; - - /* Save the initial sp to restore them later to run the guest again */ - sp = vcpu_get_reg(vcpu, ARM64_CORE_REG(sp_el1)); - - run_vcpu(vcpu, pmcr_n); - - /* - * Reset and re-initialize the vCPU, and run the guest code again to - * check if PMCR_EL0.N is preserved. 
- */ - vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); - init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); - aarch64_vcpu_setup(vcpu, &init); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), sp); - vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); - - run_vcpu(vcpu, pmcr_n); - - destroy_vpmu_vm(); -} - -static struct pmreg_sets validity_check_reg_sets[] = { - PMREG_SET(SYS_PMCNTENSET_EL0, SYS_PMCNTENCLR_EL0), - PMREG_SET(SYS_PMINTENSET_EL1, SYS_PMINTENCLR_EL1), - PMREG_SET(SYS_PMOVSSET_EL0, SYS_PMOVSCLR_EL0), -}; - -/* - * Create a VM, and check if KVM handles the userspace accesses of - * the PMU register sets in @validity_check_reg_sets[] correctly. - */ -static void run_pmregs_validity_test(uint64_t pmcr_n) -{ - int i; - struct kvm_vcpu *vcpu; - uint64_t set_reg_id, clr_reg_id, reg_val; - uint64_t valid_counters_mask, max_counters_mask; - - test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); - vcpu = vpmu_vm.vcpu; - - valid_counters_mask = get_counters_mask(pmcr_n); - max_counters_mask = get_counters_mask(ARMV8_PMU_MAX_COUNTERS); - - for (i = 0; i < ARRAY_SIZE(validity_check_reg_sets); i++) { - set_reg_id = validity_check_reg_sets[i].set_reg_id; - clr_reg_id = validity_check_reg_sets[i].clr_reg_id; - - /* - * Test if the 'set' and 'clr' variants of the registers - * are initialized based on the number of valid counters. - */ - reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); - TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", - KVM_ARM64_SYS_REG(set_reg_id), reg_val); - - reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); - TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", - KVM_ARM64_SYS_REG(clr_reg_id), reg_val); - - /* - * Using the 'set' variant, force-set the register to the - * max number of possible counters and test if KVM discards - * the bits for unimplemented counters as it should. - */ - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), max_counters_mask); - - reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); - TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", - KVM_ARM64_SYS_REG(set_reg_id), reg_val); - - reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); - TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", - KVM_ARM64_SYS_REG(clr_reg_id), reg_val); - } - - destroy_vpmu_vm(); -} - -/* - * Create a guest with one vCPU, and attempt to set the PMCR_EL0.N for - * the vCPU to @pmcr_n, which is larger than the host value. - * The attempt should fail as @pmcr_n is too big to set for the vCPU. - */ -static void run_error_test(uint64_t pmcr_n) -{ - pr_debug("Error test with pmcr_n %lu (larger than the host)\n", pmcr_n); - - test_create_vpmu_vm_with_pmcr_n(pmcr_n, true); - destroy_vpmu_vm(); -} - -/* - * Return the default number of implemented PMU event counters excluding - * the cycle counter (i.e. PMCR_EL0.N value) for the guest. 
- */ -static uint64_t get_pmcr_n_limit(void) -{ - uint64_t pmcr; - - create_vpmu_vm(guest_code); - pmcr = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); - destroy_vpmu_vm(); - return get_pmcr_n(pmcr); -} - -int main(void) -{ - uint64_t i, pmcr_n; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3)); - - pmcr_n = get_pmcr_n_limit(); - for (i = 0; i <= pmcr_n; i++) { - run_access_test(i); - run_pmregs_validity_test(i); - } - - for (i = pmcr_n + 1; i < ARMV8_PMU_MAX_COUNTERS; i++) - run_error_test(i); - - return 0; -} diff --git a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c new file mode 100644 index 000000000000..447d61cae4db --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aarch32_id_regs - Test for ID register behavior on AArch64-only systems + * + * Copyright (c) 2022 Google LLC. + * + * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ + * and WI from userspace. + */ + +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include + +#define BAD_ID_REG_VAL 0x1badc0deul + +#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) + +static void guest_main(void) +{ + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); + + GUEST_DONE(); +} + +static void test_guest_raz(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +static uint64_t raz_wi_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), +}; + +static void test_user_raz_wi(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raz_wi_reg_ids); i++) { 
+ uint64_t reg_id = raz_wi_reg_ids[i]; + uint64_t val; + + val = vcpu_get_reg(vcpu, reg_id); + TEST_ASSERT_EQ(val, 0); + + /* + * Expect the ioctl to succeed with no effect on the register + * value. + */ + vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + + val = vcpu_get_reg(vcpu, reg_id); + TEST_ASSERT_EQ(val, 0); + } +} + +static uint64_t raz_invariant_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), + KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), +}; + +static void test_user_raz_invariant(struct kvm_vcpu *vcpu) +{ + int i, r; + + for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { + uint64_t reg_id = raz_invariant_reg_ids[i]; + uint64_t val; + + val = vcpu_get_reg(vcpu, reg_id); + TEST_ASSERT_EQ(val, 0); + + r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + TEST_ASSERT(r < 0 && errno == EINVAL, + "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + val = vcpu_get_reg(vcpu, reg_id); + TEST_ASSERT_EQ(val, 0); + } +} + + + +static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) +{ + uint64_t val, el0; + + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); + + el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + return el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY; +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + TEST_REQUIRE(vcpu_aarch64_only(vcpu)); + + test_user_raz_wi(vcpu); + test_user_raz_invariant(vcpu); + test_guest_raz(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/arm64/arch_timer.c b/tools/testing/selftests/kvm/arm64/arch_timer.c new file mode 100644 index 000000000000..eeba1cc87ff8 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/arch_timer.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates both the virtual and physical timer IRQs using + * CVAL and TVAL registers. + * + * Copyright (c) 2021, Google LLC. 
+ */ +#include "arch_timer.h" +#include "delay.h" +#include "gic.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" +#include "vgic.h" + +enum guest_stage { + GUEST_STAGE_VTIMER_CVAL = 1, + GUEST_STAGE_VTIMER_TVAL, + GUEST_STAGE_PTIMER_CVAL, + GUEST_STAGE_PTIMER_TVAL, + GUEST_STAGE_MAX, +}; + +static int vtimer_irq, ptimer_irq; + +static void +guest_configure_timer_action(struct test_vcpu_shared_data *shared_data) +{ + switch (shared_data->guest_stage) { + case GUEST_STAGE_VTIMER_CVAL: + timer_set_next_cval_ms(VIRTUAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(VIRTUAL); + timer_set_ctl(VIRTUAL, CTL_ENABLE); + break; + case GUEST_STAGE_VTIMER_TVAL: + timer_set_next_tval_ms(VIRTUAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(VIRTUAL); + timer_set_ctl(VIRTUAL, CTL_ENABLE); + break; + case GUEST_STAGE_PTIMER_CVAL: + timer_set_next_cval_ms(PHYSICAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(PHYSICAL); + timer_set_ctl(PHYSICAL, CTL_ENABLE); + break; + case GUEST_STAGE_PTIMER_TVAL: + timer_set_next_tval_ms(PHYSICAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(PHYSICAL); + timer_set_ctl(PHYSICAL, CTL_ENABLE); + break; + default: + GUEST_ASSERT(0); + } +} + +static void guest_validate_irq(unsigned int intid, + struct test_vcpu_shared_data *shared_data) +{ + enum guest_stage stage = shared_data->guest_stage; + uint64_t xcnt = 0, xcnt_diff_us, cval = 0; + unsigned long xctl = 0; + unsigned int timer_irq = 0; + unsigned int accessor; + + if (intid == IAR_SPURIOUS) + return; + + switch (stage) { + case GUEST_STAGE_VTIMER_CVAL: + case GUEST_STAGE_VTIMER_TVAL: + accessor = VIRTUAL; + timer_irq = vtimer_irq; + break; + case GUEST_STAGE_PTIMER_CVAL: + case GUEST_STAGE_PTIMER_TVAL: + accessor = PHYSICAL; + timer_irq = ptimer_irq; + break; + default: + GUEST_ASSERT(0); + return; + } + + xctl = timer_get_ctl(accessor); + if ((xctl & CTL_IMASK) || !(xctl & CTL_ENABLE)) + return; + + timer_set_ctl(accessor, CTL_IMASK); + xcnt = timer_get_cntct(accessor); + cval = timer_get_cval(accessor); + + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, timer_irq); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(xcnt >= cval, + "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx", + xcnt, cval, xcnt_diff_us); + __GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl); + + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid = gic_get_and_ack_irq(); + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + guest_validate_irq(intid, shared_data); + + gic_set_eoi(intid); +} + +static void guest_run_stage(struct test_vcpu_shared_data *shared_data, + enum guest_stage stage) +{ + uint32_t irq_iter, config_iter; + + shared_data->guest_stage = stage; + shared_data->nr_iter = 0; + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup the next interrupt */ + guest_configure_timer_action(shared_data); + + /* Setup a timeout for the interrupt to arrive */ + udelay(msecs_to_usecs(test_args.timer_period_ms) + + test_args.timer_err_margin_us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the 
specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + local_irq_disable(); + + gic_init(GIC_V3, test_args.nr_vcpus); + + timer_set_ctl(VIRTUAL, CTL_IMASK); + timer_set_ctl(PHYSICAL, CTL_IMASK); + + gic_irq_enable(vtimer_irq); + gic_irq_enable(ptimer_irq); + local_irq_enable(); + + guest_run_stage(shared_data, GUEST_STAGE_VTIMER_CVAL); + guest_run_stage(shared_data, GUEST_STAGE_VTIMER_TVAL); + guest_run_stage(shared_data, GUEST_STAGE_PTIMER_CVAL); + guest_run_stage(shared_data, GUEST_STAGE_PTIMER_TVAL); + + GUEST_DONE(); +} + +static void test_init_timer_irq(struct kvm_vm *vm) +{ + /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); + + sync_global_to_guest(vm, ptimer_irq); + sync_global_to_guest(vm, vtimer_irq); + + pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); +} + +static int gic_fd; + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + unsigned int i; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + if (!test_args.reserved) { + if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) { + struct kvm_arm_counter_offset offset = { + .counter_offset = test_args.counter_offset, + .reserved = 0, + }; + vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &offset); + } else + TEST_FAIL("no support for global offset"); + } + + for (i = 0; i < nr_vcpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + test_init_timer_irq(vm); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + close(gic_fd); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c new file mode 100644 index 000000000000..a36a7e2db434 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch_timer_edge_cases.c - Tests the aarch64 timer IRQ functionality. + * + * The test validates some edge cases related to the arch-timer: + * - timers above the max TVAL value. + * - timers in the past + * - moving counters ahead and behind pending timers. + * - reprograming timers. + * - timers fired multiple times. + * - masking/unmasking using the timer control mask. + * + * Copyright (c) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +#include +#include + +#include "arch_timer.h" +#include "gic.h" +#include "vgic.h" + +static const uint64_t CVAL_MAX = ~0ULL; +/* tval is a signed 32-bit int. */ +static const int32_t TVAL_MAX = INT32_MAX; +static const int32_t TVAL_MIN = INT32_MIN; + +/* After how much time we say there is no IRQ. */ +static const uint32_t TIMEOUT_NO_IRQ_US = 50000; + +/* A nice counter value to use as the starting one for most tests. */ +static const uint64_t DEF_CNT = (CVAL_MAX / 2); + +/* Number of runs. 
*/ +static const uint32_t NR_TEST_ITERS_DEF = 5; + +/* Default wait test time in ms. */ +static const uint32_t WAIT_TEST_MS = 10; + +/* Default "long" wait test time in ms. */ +static const uint32_t LONG_WAIT_TEST_MS = 100; + +/* Shared with IRQ handler. */ +struct test_vcpu_shared_data { + atomic_t handled; + atomic_t spurious; +} shared_data; + +struct test_args { + /* Virtual or physical timer and counter tests. */ + enum arch_timer timer; + /* Delay used for most timer tests. */ + uint64_t wait_ms; + /* Delay used in the test_long_timer_delays test. */ + uint64_t long_wait_ms; + /* Number of iterations. */ + int iterations; + /* Whether to test the physical timer. */ + bool test_physical; + /* Whether to test the virtual timer. */ + bool test_virtual; +}; + +struct test_args test_args = { + .wait_ms = WAIT_TEST_MS, + .long_wait_ms = LONG_WAIT_TEST_MS, + .iterations = NR_TEST_ITERS_DEF, + .test_physical = true, + .test_virtual = true, +}; + +static int vtimer_irq, ptimer_irq; + +enum sync_cmd { + SET_COUNTER_VALUE, + USERSPACE_USLEEP, + USERSPACE_SCHED_YIELD, + USERSPACE_MIGRATE_SELF, + NO_USERSPACE_CMD, +}; + +typedef void (*sleep_method_t)(enum arch_timer timer, uint64_t usec); + +static void sleep_poll(enum arch_timer timer, uint64_t usec); +static void sleep_sched_poll(enum arch_timer timer, uint64_t usec); +static void sleep_in_userspace(enum arch_timer timer, uint64_t usec); +static void sleep_migrate(enum arch_timer timer, uint64_t usec); + +sleep_method_t sleep_method[] = { + sleep_poll, + sleep_sched_poll, + sleep_migrate, + sleep_in_userspace, +}; + +typedef void (*irq_wait_method_t)(void); + +static void wait_for_non_spurious_irq(void); +static void wait_poll_for_irq(void); +static void wait_sched_poll_for_irq(void); +static void wait_migrate_poll_for_irq(void); + +irq_wait_method_t irq_wait_method[] = { + wait_for_non_spurious_irq, + wait_poll_for_irq, + wait_sched_poll_for_irq, + wait_migrate_poll_for_irq, +}; + +enum timer_view { + TIMER_CVAL, + TIMER_TVAL, +}; + +static void assert_irqs_handled(uint32_t n) +{ + int h = atomic_read(&shared_data.handled); + + __GUEST_ASSERT(h == n, "Handled %d IRQS but expected %d", h, n); +} + +static void userspace_cmd(uint64_t cmd) +{ + GUEST_SYNC_ARGS(cmd, 0, 0, 0, 0); +} + +static void userspace_migrate_vcpu(void) +{ + userspace_cmd(USERSPACE_MIGRATE_SELF); +} + +static void userspace_sleep(uint64_t usecs) +{ + GUEST_SYNC_ARGS(USERSPACE_USLEEP, usecs, 0, 0, 0); +} + +static void set_counter(enum arch_timer timer, uint64_t counter) +{ + GUEST_SYNC_ARGS(SET_COUNTER_VALUE, counter, timer, 0, 0); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid = gic_get_and_ack_irq(); + enum arch_timer timer; + uint64_t cnt, cval; + uint32_t ctl; + bool timer_condition, istatus; + + if (intid == IAR_SPURIOUS) { + atomic_inc(&shared_data.spurious); + goto out; + } + + if (intid == ptimer_irq) + timer = PHYSICAL; + else if (intid == vtimer_irq) + timer = VIRTUAL; + else + goto out; + + ctl = timer_get_ctl(timer); + cval = timer_get_cval(timer); + cnt = timer_get_cntct(timer); + timer_condition = cnt >= cval; + istatus = (ctl & CTL_ISTATUS) && (ctl & CTL_ENABLE); + GUEST_ASSERT_EQ(timer_condition, istatus); + + /* Disable and mask the timer. 
*/
+	timer_set_ctl(timer, CTL_IMASK);
+
+	atomic_inc(&shared_data.handled);
+
+out:
+	gic_set_eoi(intid);
+}
+
+static void set_cval_irq(enum arch_timer timer, uint64_t cval_cycles,
+			 uint32_t ctl)
+{
+	atomic_set(&shared_data.handled, 0);
+	atomic_set(&shared_data.spurious, 0);
+	timer_set_cval(timer, cval_cycles);
+	timer_set_ctl(timer, ctl);
+}
+
+static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles,
+			 uint32_t ctl)
+{
+	atomic_set(&shared_data.handled, 0);
+	atomic_set(&shared_data.spurious, 0);
+	timer_set_ctl(timer, ctl);
+	timer_set_tval(timer, tval_cycles);
+}
+
+static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl,
+			 enum timer_view tv)
+{
+	switch (tv) {
+	case TIMER_CVAL:
+		set_cval_irq(timer, xval, ctl);
+		break;
+	case TIMER_TVAL:
+		set_tval_irq(timer, xval, ctl);
+		break;
+	default:
+		GUEST_FAIL("Could not get timer %d", timer);
+	}
+}
+
+/*
+ * Note that this can theoretically hang forever, so we rely on having
+ * a timeout mechanism in the "runner", like:
+ * tools/testing/selftests/kselftest/runner.sh.
+ */
+static void wait_for_non_spurious_irq(void)
+{
+	int h;
+
+	local_irq_disable();
+
+	for (h = atomic_read(&shared_data.handled); h == atomic_read(&shared_data.handled);) {
+		wfi();
+		local_irq_enable();
+		isb(); /* handle IRQ */
+		local_irq_disable();
+	}
+}
+
+/*
+ * Wait for a non-spurious IRQ by polling in the guest or in
+ * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD).
+ *
+ * Note that this can theoretically hang forever, so we rely on having
+ * a timeout mechanism in the "runner", like:
+ * tools/testing/selftests/kselftest/runner.sh.
+ */
+static void poll_for_non_spurious_irq(enum sync_cmd usp_cmd)
+{
+	int h;
+
+	local_irq_disable();
+
+	h = atomic_read(&shared_data.handled);
+
+	local_irq_enable();
+	while (h == atomic_read(&shared_data.handled)) {
+		if (usp_cmd == NO_USERSPACE_CMD)
+			cpu_relax();
+		else
+			userspace_cmd(usp_cmd);
+	}
+	local_irq_disable();
+}
+
+static void wait_poll_for_irq(void)
+{
+	poll_for_non_spurious_irq(NO_USERSPACE_CMD);
+}
+
+static void wait_sched_poll_for_irq(void)
+{
+	poll_for_non_spurious_irq(USERSPACE_SCHED_YIELD);
+}
+
+static void wait_migrate_poll_for_irq(void)
+{
+	poll_for_non_spurious_irq(USERSPACE_MIGRATE_SELF);
+}
+
+/*
+ * Sleep for usec microseconds by polling in the guest or in
+ * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD).
+ */
+static void guest_poll(enum arch_timer test_timer, uint64_t usec,
+		       enum sync_cmd usp_cmd)
+{
+	uint64_t cycles = usec_to_cycles(usec);
+	/* Whichever timer we are testing with, sleep with the other. */
+	enum arch_timer sleep_timer = 1 - test_timer;
+	uint64_t start = timer_get_cntct(sleep_timer);
+
+	while ((timer_get_cntct(sleep_timer) - start) < cycles) {
+		if (usp_cmd == NO_USERSPACE_CMD)
+			cpu_relax();
+		else
+			userspace_cmd(usp_cmd);
+	}
+}
+
+static void sleep_poll(enum arch_timer timer, uint64_t usec)
+{
+	guest_poll(timer, usec, NO_USERSPACE_CMD);
+}
+
+static void sleep_sched_poll(enum arch_timer timer, uint64_t usec)
+{
+	guest_poll(timer, usec, USERSPACE_SCHED_YIELD);
+}
+
+static void sleep_migrate(enum arch_timer timer, uint64_t usec)
+{
+	guest_poll(timer, usec, USERSPACE_MIGRATE_SELF);
+}
+
+static void sleep_in_userspace(enum arch_timer timer, uint64_t usec)
+{
+	userspace_sleep(usec);
+}
+
+/*
+ * Reset the timer state to some nice values like the counter not being close
+ * to the edge, and the control register masked and disabled.
+ */ +static void reset_timer_state(enum arch_timer timer, uint64_t cnt) +{ + set_counter(timer, cnt); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timer_xval(enum arch_timer timer, uint64_t xval, + enum timer_view tv, irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + local_irq_disable(); + + if (reset_state) + reset_timer_state(timer, reset_cnt); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * The test_timer_* functions will program the timer, wait for it, and assert + * the firing of the correct IRQ. + * + * These functions don't have a timeout and return as soon as they receive an + * IRQ. They can hang (forever), so we rely on having a timeout mechanism in + * the "runner", like: tools/testing/selftests/kselftest/runner.sh. + */ + +static void test_timer_cval(enum arch_timer timer, uint64_t cval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, cval, TIMER_CVAL, wm, reset_state, reset_cnt); +} + +static void test_timer_tval(enum arch_timer timer, int32_t tval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, (uint64_t) tval, TIMER_TVAL, wm, reset_state, + reset_cnt); +} + +static void test_xval_check_no_irq(enum arch_timer timer, uint64_t xval, + uint64_t usec, enum timer_view timer_view, + sleep_method_t guest_sleep) +{ + local_irq_disable(); + + set_xval_irq(timer, xval, CTL_ENABLE | CTL_IMASK, timer_view); + guest_sleep(timer, usec); + + local_irq_enable(); + isb(); + + /* Assume success (no IRQ) after waiting usec microseconds */ + assert_irqs_handled(0); +} + +static void test_cval_no_irq(enum arch_timer timer, uint64_t cval, + uint64_t usec, sleep_method_t wm) +{ + test_xval_check_no_irq(timer, cval, usec, TIMER_CVAL, wm); +} + +static void test_tval_no_irq(enum arch_timer timer, int32_t tval, uint64_t usec, + sleep_method_t wm) +{ + /* tval will be cast to an int32_t in test_xval_check_no_irq */ + test_xval_check_no_irq(timer, (uint64_t) tval, usec, TIMER_TVAL, wm); +} + +/* Test masking/unmasking a timer using the timer mask (not the IRQ mask). */ +static void test_timer_control_mask_then_unmask(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Unmask the timer, and then get an IRQ. */ + local_irq_disable(); + timer_set_ctl(timer, CTL_ENABLE); + /* This method re-enables IRQs to handle the one we're looking for. */ + wait_for_non_spurious_irq(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Check that timer control masks actually mask a timer being fired. */ +static void test_timer_control_masks(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + + /* Local IRQs are not masked at this point. */ + + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + sleep_poll(timer, TIMEOUT_NO_IRQ_US); + + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_fire_a_timer_multiple_times(enum arch_timer timer, + irq_wait_method_t wm, int num) +{ + int i; + + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_tval_irq(timer, 0, CTL_ENABLE); + + for (i = 1; i <= num; i++) { + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + /* The IRQ handler masked and disabled the timer. 
+	 * Enable and unmask it again.
+	 */
+		timer_set_ctl(timer, CTL_ENABLE);
+
+		assert_irqs_handled(i);
+	}
+
+	local_irq_enable();
+}
+
+static void test_timers_fired_multiple_times(enum arch_timer timer)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++)
+		test_fire_a_timer_multiple_times(timer, irq_wait_method[i], 10);
+}
+
+/*
+ * Set a timer for tval=delta_1_ms then reprogram it to
+ * tval=delta_2_ms. Check that we get the timer fired. There is no
+ * timeout for the wait: we use the wfi instruction.
+ */
+static void test_reprogramming_timer(enum arch_timer timer, irq_wait_method_t wm,
+				     int32_t delta_1_ms, int32_t delta_2_ms)
+{
+	local_irq_disable();
+	reset_timer_state(timer, DEF_CNT);
+
+	/* Program the timer to DEF_CNT + delta_1_ms. */
+	set_tval_irq(timer, msec_to_cycles(delta_1_ms), CTL_ENABLE);
+
+	/* Reprogram the timer to DEF_CNT + delta_2_ms. */
+	timer_set_tval(timer, msec_to_cycles(delta_2_ms));
+
+	/* This method re-enables IRQs to handle the one we're looking for. */
+	wm();
+
+	/* The IRQ should arrive at DEF_CNT + delta_2_ms (or after). */
+	GUEST_ASSERT(timer_get_cntct(timer) >=
+		     DEF_CNT + msec_to_cycles(delta_2_ms));
+
+	local_irq_enable();
+	assert_irqs_handled(1);
+};
+
+static void test_reprogram_timers(enum arch_timer timer)
+{
+	int i;
+	uint64_t base_wait = test_args.wait_ms;
+
+	for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) {
+		/*
+		 * Ensure reprogramming works whether going from a
+		 * longer time to a shorter or vice versa.
+		 */
+		test_reprogramming_timer(timer, irq_wait_method[i], 2 * base_wait,
+					 base_wait);
+		test_reprogramming_timer(timer, irq_wait_method[i], base_wait,
+					 2 * base_wait);
+	}
+}
+
+static void test_basic_functionality(enum arch_timer timer)
+{
+	int32_t tval = (int32_t) msec_to_cycles(test_args.wait_ms);
+	uint64_t cval = DEF_CNT + msec_to_cycles(test_args.wait_ms);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) {
+		irq_wait_method_t wm = irq_wait_method[i];
+
+		test_timer_cval(timer, cval, wm, true, DEF_CNT);
+		test_timer_tval(timer, tval, wm, true, DEF_CNT);
+	}
+}
+
+/*
+ * This test checks basic timer behavior without actually firing timers, things
+ * like: the relationship between cval and tval, tval down-counting.
+ */
+static void timers_sanity_checks(enum arch_timer timer, bool use_sched)
+{
+	reset_timer_state(timer, DEF_CNT);
+
+	local_irq_disable();
+
+	/* cval in the past */
+	timer_set_cval(timer,
+		       timer_get_cntct(timer) -
+		       msec_to_cycles(test_args.wait_ms));
+	if (use_sched)
+		userspace_migrate_vcpu();
+	GUEST_ASSERT(timer_get_tval(timer) < 0);
+
+	/* tval in the past */
+	timer_set_tval(timer, -1);
+	if (use_sched)
+		userspace_migrate_vcpu();
+	GUEST_ASSERT(timer_get_cval(timer) < timer_get_cntct(timer));
+
+	/* tval larger than TVAL_MAX. This requires programming with
+	 * timer_set_cval instead so the value is expressible
+	 */
+	timer_set_cval(timer,
+		       timer_get_cntct(timer) + TVAL_MAX +
+		       msec_to_cycles(test_args.wait_ms));
+	if (use_sched)
+		userspace_migrate_vcpu();
+	GUEST_ASSERT(timer_get_tval(timer) <= 0);
+
+	/*
+	 * tval larger than 2 * TVAL_MAX.
+	 * Twice the TVAL_MAX completely loops around the TVAL.
+	 */
+	timer_set_cval(timer,
+		       timer_get_cntct(timer) + 2ULL * TVAL_MAX +
+		       msec_to_cycles(test_args.wait_ms));
+	if (use_sched)
+		userspace_migrate_vcpu();
+	GUEST_ASSERT(timer_get_tval(timer) <=
+		     msec_to_cycles(test_args.wait_ms));
+
+	/* negative tval that rolls over from 0.
*/ + set_counter(timer, msec_to_cycles(1)); + timer_set_tval(timer, -1 * msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_cval(timer) >= (CVAL_MAX - msec_to_cycles(test_args.wait_ms))); + + /* tval should keep down-counting from 0 to -1. */ + timer_set_tval(timer, 0); + sleep_poll(timer, 1); + GUEST_ASSERT(timer_get_tval(timer) < 0); + + local_irq_enable(); + + /* Mask and disable any pending timer. */ + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timers_sanity_checks(enum arch_timer timer) +{ + timers_sanity_checks(timer, false); + /* Check how KVM saves/restores these edge-case values. */ + timers_sanity_checks(timer, true); +} + +static void test_set_cnt_after_tval_max(enum arch_timer timer, irq_wait_method_t wm) +{ + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_cval_irq(timer, + (uint64_t) TVAL_MAX + + msec_to_cycles(test_args.wait_ms) / 2, CTL_ENABLE); + + set_counter(timer, TVAL_MAX); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Test timers set for: cval = now + TVAL_MAX + wait_ms / 2 */ +static void test_timers_above_tval_max(enum arch_timer timer) +{ + uint64_t cval; + int i; + + /* + * Test that the system is not implementing cval in terms of + * tval. If that was the case, setting a cval to "cval = now + * + TVAL_MAX + wait_ms" would wrap to "cval = now + + * wait_ms", and the timer would fire immediately. Test that it + * doesn't. + */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + reset_timer_state(timer, DEF_CNT); + cval = timer_get_cntct(timer) + TVAL_MAX + + msec_to_cycles(test_args.wait_ms); + test_cval_no_irq(timer, cval, + msecs_to_usecs(test_args.wait_ms) + + TIMEOUT_NO_IRQ_US, sleep_method[i]); + } + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + /* Get the IRQ by moving the counter forward. */ + test_set_cnt_after_tval_max(timer, irq_wait_method[i]); + } +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. + */ +static void test_set_cnt_after_xval(enum arch_timer timer, uint64_t cnt_1, + uint64_t xval, uint64_t cnt_2, + irq_wait_method_t wm, enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. 
+ */ +static void test_set_cnt_after_xval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t xval, + uint64_t cnt_2, + sleep_method_t guest_sleep, + enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + guest_sleep(timer, TIMEOUT_NO_IRQ_US); + + local_irq_enable(); + isb(); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_set_cnt_after_tval(enum arch_timer timer, uint64_t cnt_1, + int32_t tval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, tval, cnt_2, wm, TIMER_TVAL); +} + +static void test_set_cnt_after_cval(enum arch_timer timer, uint64_t cnt_1, + uint64_t cval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, cval, cnt_2, wm, TIMER_CVAL); +} + +static void test_set_cnt_after_tval_no_irq(enum arch_timer timer, + uint64_t cnt_1, int32_t tval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, tval, cnt_2, wm, + TIMER_TVAL); +} + +static void test_set_cnt_after_cval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t cval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, cval, cnt_2, wm, + TIMER_CVAL); +} + +/* Set a timer and then move the counter ahead of it. */ +static void test_move_counters_ahead_of_timers(enum arch_timer timer) +{ + int i; + int32_t tval; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_set_cnt_after_cval(timer, 0, DEF_CNT, DEF_CNT + 1, wm); + test_set_cnt_after_cval(timer, CVAL_MAX, 1, 2, wm); + + /* Move counter ahead of negative tval. */ + test_set_cnt_after_tval(timer, 0, -1, DEF_CNT + 1, wm); + test_set_cnt_after_tval(timer, 0, -1, TVAL_MAX, wm); + tval = TVAL_MAX; + test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1, + wm); + } + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); + } +} + +/* + * Program a timer, mask it, and then change the tval or counter to cancel it. + * Unmask it and check that nothing fires. + */ +static void test_move_counters_behind_timers(enum arch_timer timer) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, DEF_CNT, DEF_CNT - 1, 0, + sm); + test_set_cnt_after_tval_no_irq(timer, DEF_CNT, -1, 0, sm); + } +} + +static void test_timers_in_the_past(enum arch_timer timer) +{ + int32_t tval = -1 * (int32_t) msec_to_cycles(test_args.wait_ms); + uint64_t cval; + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + /* set a timer wait_ms the past. */ + cval = DEF_CNT - msec_to_cycles(test_args.wait_ms); + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + + /* Set a timer to counter=0 (in the past) */ + test_timer_cval(timer, 0, wm, true, DEF_CNT); + + /* Set a time for tval=0 (now) */ + test_timer_tval(timer, 0, wm, true, DEF_CNT); + + /* Set a timer to as far in the past as possible */ + test_timer_tval(timer, TVAL_MIN, wm, true, DEF_CNT); + } + + /* + * Set the counter to wait_ms, and a tval to -wait_ms. There should be no + * IRQ as that tval means cval=CVAL_MAX-wait_ms. 
+ */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + set_counter(timer, msec_to_cycles(test_args.wait_ms)); + test_tval_no_irq(timer, tval, TIMEOUT_NO_IRQ_US, sm); + } +} + +static void test_long_timer_delays(enum arch_timer timer) +{ + int32_t tval = (int32_t) msec_to_cycles(test_args.long_wait_ms); + uint64_t cval = DEF_CNT + msec_to_cycles(test_args.long_wait_ms); + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + } +} + +static void guest_run_iteration(enum arch_timer timer) +{ + test_basic_functionality(timer); + test_timers_sanity_checks(timer); + + test_timers_above_tval_max(timer); + test_timers_in_the_past(timer); + + test_move_counters_ahead_of_timers(timer); + test_move_counters_behind_timers(timer); + test_reprogram_timers(timer); + + test_timers_fired_multiple_times(timer); + + test_timer_control_mask_then_unmask(timer); + test_timer_control_masks(timer); +} + +static void guest_code(enum arch_timer timer) +{ + int i; + + local_irq_disable(); + + gic_init(GIC_V3, 1); + + timer_set_ctl(VIRTUAL, CTL_IMASK); + timer_set_ctl(PHYSICAL, CTL_IMASK); + + gic_irq_enable(vtimer_irq); + gic_irq_enable(ptimer_irq); + local_irq_enable(); + + for (i = 0; i < test_args.iterations; i++) { + GUEST_SYNC(i); + guest_run_iteration(timer); + } + + test_long_timer_delays(timer); + GUEST_DONE(); +} + +static uint32_t next_pcpu(void) +{ + uint32_t max = get_nprocs(); + uint32_t cur = sched_getcpu(); + uint32_t next = cur; + cpu_set_t cpuset; + + TEST_ASSERT(max > 1, "Need at least two physical cpus"); + + sched_getaffinity(0, sizeof(cpuset), &cpuset); + + do { + next = (next + 1) % CPU_SETSIZE; + } while (!CPU_ISSET(next, &cpuset)); + + return next; +} + +static void migrate_self(uint32_t new_pcpu) +{ + int ret; + cpu_set_t cpuset; + pthread_t thread; + + thread = pthread_self(); + + CPU_ZERO(&cpuset); + CPU_SET(new_pcpu, &cpuset); + + pr_debug("Migrating from %u to %u\n", sched_getcpu(), new_pcpu); + + ret = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); + + TEST_ASSERT(ret == 0, "Failed to migrate to pCPU: %u; ret: %d\n", + new_pcpu, ret); +} + +static void kvm_set_cntxct(struct kvm_vcpu *vcpu, uint64_t cnt, + enum arch_timer timer) +{ + if (timer == PHYSICAL) + vcpu_set_reg(vcpu, KVM_REG_ARM_PTIMER_CNT, cnt); + else + vcpu_set_reg(vcpu, KVM_REG_ARM_TIMER_CNT, cnt); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + enum sync_cmd cmd = uc->args[1]; + uint64_t val = uc->args[2]; + enum arch_timer timer = uc->args[3]; + + switch (cmd) { + case SET_COUNTER_VALUE: + kvm_set_cntxct(vcpu, val, timer); + break; + case USERSPACE_USLEEP: + usleep(val); + break; + case USERSPACE_SCHED_YIELD: + sched_yield(); + break; + case USERSPACE_MIGRATE_SELF: + migrate_self(next_pcpu()); + break; + default: + break; + } +} + +static void test_run(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + /* Start on CPU 0 */ + migrate_self(0); + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(vcpu, &uc); + break; + case UCALL_DONE: + goto out; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto out; + default: + TEST_FAIL("Unexpected guest exit\n"); + } + } + + out: + return; +} + +static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + 
KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); + + sync_global_to_guest(vm, ptimer_irq); + sync_global_to_guest(vm, vtimer_irq); + + pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); +} + +static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, + enum arch_timer timer) +{ + *vm = vm_create_with_one_vcpu(vcpu, guest_code); + TEST_ASSERT(*vm, "Failed to create the test VM\n"); + + vm_init_descriptor_tables(*vm); + vm_install_exception_handler(*vm, VECTOR_IRQ_CURRENT, + guest_irq_handler); + + vcpu_init_descriptor_tables(*vcpu); + vcpu_args_set(*vcpu, 1, timer); + + test_init_timer_irq(*vm, *vcpu); + vgic_v3_setup(*vm, 1, 64); + sync_global_to_guest(*vm, test_args); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n" + , name); + pr_info("\t-i: Number of iterations (default: %u)\n", + NR_TEST_ITERS_DEF); + pr_info("\t-b: Test both physical and virtual timers (default: true)\n"); + pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n", + LONG_WAIT_TEST_MS); + pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n", + WAIT_TEST_MS); + pr_info("\t-p: Test physical timer (default: true)\n"); + pr_info("\t-v: Test virtual timer (default: true)\n"); + pr_info("\t-h: Print this help message\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "bhi:l:pvw:")) != -1) { + switch (opt) { + case 'b': + test_args.test_physical = true; + test_args.test_virtual = true; + break; + case 'i': + test_args.iterations = + atoi_positive("Number of iterations", optarg); + break; + case 'l': + test_args.long_wait_ms = + atoi_positive("Long wait time", optarg); + break; + case 'p': + test_args.test_physical = true; + test_args.test_virtual = false; + break; + case 'v': + test_args.test_virtual = true; + test_args.test_physical = false; + break; + case 'w': + test_args.wait_ms = atoi_positive("Wait time", optarg); + break; + case 'h': + default: + goto err; + } + } + + return true; + + err: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (test_args.test_virtual) { + test_vm_create(&vm, &vcpu, VIRTUAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + if (test_args.test_physical) { + test_vm_create(&vm, &vcpu, PHYSICAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/debug-exceptions.c b/tools/testing/selftests/kvm/arm64/debug-exceptions.c new file mode 100644 index 000000000000..c7fb55c9135b --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/debug-exceptions.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#define MDSCR_KDE (1 << 13) +#define MDSCR_MDE (1 << 15) +#define MDSCR_SS (1 << 0) + +#define DBGBCR_LEN8 (0xff << 5) +#define DBGBCR_EXEC (0x0 << 3) +#define DBGBCR_EL1 (0x1 << 1) +#define DBGBCR_E (0x1 << 0) +#define DBGBCR_LBN_SHIFT 16 +#define DBGBCR_BT_SHIFT 20 +#define DBGBCR_BT_ADDR_LINK_CTX (0x1 << DBGBCR_BT_SHIFT) +#define DBGBCR_BT_CTX_LINK (0x3 << DBGBCR_BT_SHIFT) + +#define DBGWCR_LEN8 (0xff << 5) +#define DBGWCR_RD (0x1 << 3) +#define DBGWCR_WR (0x2 << 3) +#define DBGWCR_EL1 (0x1 << 1) 
+#define DBGWCR_E (0x1 << 0) +#define DBGWCR_LBN_SHIFT 16 +#define DBGWCR_WT_SHIFT 20 +#define DBGWCR_WT_LINK (0x1 << DBGWCR_WT_SHIFT) + +#define SPSR_D (1 << 9) +#define SPSR_SS (1 << 21) + +extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start, hw_bp_ctx; +extern unsigned char iter_ss_begin, iter_ss_end; +static volatile uint64_t sw_bp_addr, hw_bp_addr; +static volatile uint64_t wp_addr, wp_data_addr; +static volatile uint64_t svc_addr; +static volatile uint64_t ss_addr[4], ss_idx; +#define PC(v) ((uint64_t)&(v)) + +#define GEN_DEBUG_WRITE_REG(reg_name) \ +static void write_##reg_name(int num, uint64_t val) \ +{ \ + switch (num) { \ + case 0: \ + write_sysreg(val, reg_name##0_el1); \ + break; \ + case 1: \ + write_sysreg(val, reg_name##1_el1); \ + break; \ + case 2: \ + write_sysreg(val, reg_name##2_el1); \ + break; \ + case 3: \ + write_sysreg(val, reg_name##3_el1); \ + break; \ + case 4: \ + write_sysreg(val, reg_name##4_el1); \ + break; \ + case 5: \ + write_sysreg(val, reg_name##5_el1); \ + break; \ + case 6: \ + write_sysreg(val, reg_name##6_el1); \ + break; \ + case 7: \ + write_sysreg(val, reg_name##7_el1); \ + break; \ + case 8: \ + write_sysreg(val, reg_name##8_el1); \ + break; \ + case 9: \ + write_sysreg(val, reg_name##9_el1); \ + break; \ + case 10: \ + write_sysreg(val, reg_name##10_el1); \ + break; \ + case 11: \ + write_sysreg(val, reg_name##11_el1); \ + break; \ + case 12: \ + write_sysreg(val, reg_name##12_el1); \ + break; \ + case 13: \ + write_sysreg(val, reg_name##13_el1); \ + break; \ + case 14: \ + write_sysreg(val, reg_name##14_el1); \ + break; \ + case 15: \ + write_sysreg(val, reg_name##15_el1); \ + break; \ + default: \ + GUEST_ASSERT(0); \ + } \ +} + +/* Define write_dbgbcr()/write_dbgbvr()/write_dbgwcr()/write_dbgwvr() */ +GEN_DEBUG_WRITE_REG(dbgbcr) +GEN_DEBUG_WRITE_REG(dbgbvr) +GEN_DEBUG_WRITE_REG(dbgwcr) +GEN_DEBUG_WRITE_REG(dbgwvr) + +static void reset_debug_state(void) +{ + uint8_t brps, wrps, i; + uint64_t dfr0; + + asm volatile("msr daifset, #8"); + + write_sysreg(0, osdlr_el1); + write_sysreg(0, oslar_el1); + isb(); + + write_sysreg(0, mdscr_el1); + write_sysreg(0, contextidr_el1); + + /* Reset all bcr/bvr/wcr/wvr registers */ + dfr0 = read_sysreg(id_aa64dfr0_el1); + brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0); + for (i = 0; i <= brps; i++) { + write_dbgbcr(i, 0); + write_dbgbvr(i, 0); + } + wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0); + for (i = 0; i <= wrps; i++) { + write_dbgwcr(i, 0); + write_dbgwvr(i, 0); + } + + isb(); +} + +static void enable_os_lock(void) +{ + write_sysreg(1, oslar_el1); + isb(); + + GUEST_ASSERT(read_sysreg(oslsr_el1) & 2); +} + +static void enable_monitor_debug_exceptions(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr, mdscr_el1); + isb(); +} + +static void install_wp(uint8_t wpn, uint64_t addr) +{ + uint32_t wcr; + + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; + write_dbgwcr(wpn, wcr); + write_dbgwvr(wpn, addr); + + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_hw_bp(uint8_t bpn, uint64_t addr) +{ + uint32_t bcr; + + bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; + write_dbgbcr(bpn, bcr); + write_dbgbvr(bpn, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_wp_ctx(uint8_t addr_wp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t wcr; + uint64_t ctx_bcr; + + 
/* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* Setup a linked watchpoint (linked to the context-aware breakpoint) */ + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E | + DBGWCR_WT_LINK | ((uint32_t)ctx_bp << DBGWCR_LBN_SHIFT); + write_dbgwcr(addr_wp, wcr); + write_dbgwvr(addr_wp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t addr_bcr, ctx_bcr; + + /* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* + * Setup a normal breakpoint for Linked Address Match, and link it + * to the context-aware breakpoint. + */ + addr_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_ADDR_LINK_CTX | + ((uint32_t)ctx_bp << DBGBCR_LBN_SHIFT); + write_dbgbcr(addr_bp, addr_bcr); + write_dbgbvr(addr_bp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_ss(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_SS; + write_sysreg(mdscr, mdscr_el1); + isb(); +} + +static volatile char write_data; + +static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) +{ + uint64_t ctx = 0xabcdef; /* a random context number */ + + /* Software-breakpoint */ + reset_debug_state(); + asm volatile("sw_bp: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); + + /* Hardware-breakpoint */ + reset_debug_state(); + install_hw_bp(bpn, PC(hw_bp)); + asm volatile("hw_bp: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); + + /* Hardware-breakpoint + svc */ + reset_debug_state(); + install_hw_bp(bpn, PC(bp_svc)); + asm volatile("bp_svc: svc #0"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); + GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); + + /* Hardware-breakpoint + software-breakpoint */ + reset_debug_state(); + install_hw_bp(bpn, PC(bp_brk)); + asm volatile("bp_brk: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); + + /* Watchpoint */ + reset_debug_state(); + install_wp(wpn, PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + /* Single-step */ + reset_debug_state(); + install_ss(); + ss_idx = 0; + asm volatile("ss_start:\n" + "mrs x0, esr_el1\n" + "add x0, x0, #1\n" + "msr daifset, #8\n" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], PC(ss_start)); + GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); + GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); + + /* OS Lock does not block software-breakpoint */ + reset_debug_state(); + enable_os_lock(); + sw_bp_addr = 0; + asm volatile("sw_bp2: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp2)); + + /* OS Lock blocking hardware-breakpoint */ + reset_debug_state(); + enable_os_lock(); + install_hw_bp(bpn, PC(hw_bp2)); + hw_bp_addr = 0; + asm volatile("hw_bp2: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, 0); + + /* OS Lock blocking watchpoint */ + reset_debug_state(); + enable_os_lock(); + write_data = '\0'; + wp_data_addr = 0; + install_wp(wpn, PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, 0); + + /* OS Lock blocking single-step */ + reset_debug_state(); 
+ enable_os_lock(); + ss_addr[0] = 0; + install_ss(); + ss_idx = 0; + asm volatile("mrs x0, esr_el1\n\t" + "add x0, x0, #1\n\t" + "msr daifset, #8\n\t" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], 0); + + /* Linked hardware-breakpoint */ + hw_bp_addr = 0; + reset_debug_state(); + install_hw_bp_ctx(bpn, ctx_bpn, PC(hw_bp_ctx), ctx); + /* Set context id */ + write_sysreg(ctx, contextidr_el1); + isb(); + asm volatile("hw_bp_ctx: nop"); + write_sysreg(0, contextidr_el1); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx)); + + /* Linked watchpoint */ + reset_debug_state(); + install_wp_ctx(wpn, ctx_bpn, PC(write_data), ctx); + /* Set context id */ + write_sysreg(ctx, contextidr_el1); + isb(); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + GUEST_DONE(); +} + +static void guest_sw_bp_handler(struct ex_regs *regs) +{ + sw_bp_addr = regs->pc; + regs->pc += 4; +} + +static void guest_hw_bp_handler(struct ex_regs *regs) +{ + hw_bp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_wp_handler(struct ex_regs *regs) +{ + wp_data_addr = read_sysreg(far_el1); + wp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_ss_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx); + ss_addr[ss_idx++] = regs->pc; + regs->pstate |= SPSR_SS; +} + +static void guest_svc_handler(struct ex_regs *regs) +{ + svc_addr = regs->pc; +} + +static void guest_code_ss(int test_cnt) +{ + uint64_t i; + uint64_t bvr, wvr, w_bvr, w_wvr; + + for (i = 0; i < test_cnt; i++) { + /* Bits [1:0] of dbg{b,w}vr are RES0 */ + w_bvr = i << 2; + w_wvr = i << 2; + + /* + * Enable Single Step execution. Note! This _must_ be a bare + * ucall as the ucall() path uses atomic operations to manage + * the ucall structures, and the built-in "atomics" are usually + * implemented via exclusive access instructions. The exlusive + * monitor is cleared on ERET, and so taking debug exceptions + * during a LDREX=>STREX sequence will prevent forward progress + * and hang the guest/test. + */ + GUEST_UCALL_NONE(); + + /* + * The userspace will verify that the pc is as expected during + * single step execution between iter_ss_begin and iter_ss_end. + */ + asm volatile("iter_ss_begin:nop\n"); + + write_sysreg(w_bvr, dbgbvr0_el1); + write_sysreg(w_wvr, dbgwvr0_el1); + bvr = read_sysreg(dbgbvr0_el1); + wvr = read_sysreg(dbgwvr0_el1); + + /* Userspace disables Single Step when the end is nigh. 
*/ + asm volatile("iter_ss_end:\n"); + + GUEST_ASSERT_EQ(bvr, w_bvr); + GUEST_ASSERT_EQ(wvr, w_wvr); + } + GUEST_DONE(); +} + +static int debug_version(uint64_t id_aa64dfr0) +{ + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0); +} + +static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_BRK64, guest_sw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_BREAKPT_CUR, guest_hw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_WATCHPT_CUR, guest_wp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_SOFTSTP_CUR, guest_ss_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_SVC64, guest_svc_handler); + + /* Specify bpn/wpn/ctx_bpn to be tested */ + vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn); + pr_debug("Use bpn#%d, wpn#%d and ctx_bpn#%d\n", bpn, wpn, ctx_bpn); + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + +done: + kvm_vm_free(vm); +} + +void test_single_step_from_userspace(int test_cnt) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + struct kvm_run *run; + uint64_t pc, cmd; + uint64_t test_pc = 0; + bool ss_enable = false; + struct kvm_guest_debug debug = {}; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); + run = vcpu->run; + vcpu_args_set(vcpu, 1, test_cnt); + + while (1) { + vcpu_run(vcpu); + if (run->exit_reason != KVM_EXIT_DEBUG) { + cmd = get_ucall(vcpu, &uc); + if (cmd == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + } else if (cmd == UCALL_DONE) { + break; + } + + TEST_ASSERT(cmd == UCALL_NONE, + "Unexpected ucall cmd 0x%lx", cmd); + + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); + + /* Check if the current pc is expected. */ + pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); + TEST_ASSERT(!test_pc || pc == test_pc, + "Unexpected pc 0x%lx (expected 0x%lx)", + pc, test_pc); + + if ((pc + 4) == (uint64_t)&iter_ss_end) { + test_pc = 0; + debug.control = KVM_GUESTDBG_ENABLE; + ss_enable = false; + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + /* + * If the current pc is between iter_ss_bgin and + * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should + * be the current pc + 4. + */ + if ((pc >= (uint64_t)&iter_ss_begin) && + (pc < (uint64_t)&iter_ss_end)) + test_pc = pc + 4; + else + test_pc = 0; + } + + kvm_vm_free(vm); +} + +/* + * Run debug testing using the various breakpoint#, watchpoint# and + * context-aware breakpoint# with the given ID_AA64DFR0_EL1 configuration. 
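+ *
+ * For example, with ID_AA64DFR0_EL1.BRPs == 5 and CTX_CMPs == 1 there
+ * are six breakpoints of which two are context-aware; the context-aware
+ * comparators are the highest numbered ones, so the loops below cover
+ * b in [0..3], c in [4..5], and every supported watchpoint number.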
+ */ +void test_guest_debug_exceptions_all(uint64_t aa64dfr0) +{ + uint8_t brp_num, wrp_num, ctx_brp_num, normal_brp_num, ctx_brp_base; + int b, w, c; + + /* Number of breakpoints */ + brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1; + __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); + + /* Number of watchpoints */ + wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1; + + /* Number of context aware breakpoints */ + ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1; + + pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, + brp_num, wrp_num, ctx_brp_num); + + /* Number of normal (non-context aware) breakpoints */ + normal_brp_num = brp_num - ctx_brp_num; + + /* Lowest context aware breakpoint number */ + ctx_brp_base = normal_brp_num; + + /* Run tests with all supported breakpoints/watchpoints */ + for (c = ctx_brp_base; c < ctx_brp_base + ctx_brp_num; c++) { + for (b = 0; b < normal_brp_num; b++) { + for (w = 0; w < wrp_num; w++) + test_guest_debug_exceptions(b, w, c); + } + } +} + +static void help(char *name) +{ + puts(""); + printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int opt; + int ss_iteration = 10000; + uint64_t aa64dfr0; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + aa64dfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); + __TEST_REQUIRE(debug_version(aa64dfr0) >= 6, + "Armv8 debug architecture not supported."); + kvm_vm_free(vm); + + while ((opt = getopt(argc, argv, "i:")) != -1) { + switch (opt) { + case 'i': + ss_iteration = atoi_positive("Number of iterations", optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + test_guest_debug_exceptions_all(aa64dfr0); + test_single_step_from_userspace(ss_iteration); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c new file mode 100644 index 000000000000..d43fb3f49050 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -0,0 +1,771 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * While the blessed list should be created from the oldest possible + * kernel, we can't go older than v5.2, though, because that's the first + * release which includes df205b5c6328 ("KVM: arm64: Filter out invalid + * core register IDs in KVM_GET_REG_LIST"). Without that commit the core + * registers won't match expectations. 
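+ *
+ * When KVM starts exposing a register that is not in the lists below,
+ * the test flags it and print_reg() emits the new entry in the same
+ * "ARM64_SYS_REG(...)" / "KVM_REG_ARM64 | ..." form used here, so the
+ * line can be pasted into the relevant table once it has been reviewed.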
+ */ +#include +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +struct feature_id_reg { + __u64 reg; + __u64 id_reg; + __u64 feat_shift; + __u64 feat_min; +}; + +static struct feature_id_reg feat_id_regs[] = { + { + ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 0, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 8, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 8, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + } +}; + +bool filter_reg(__u64 reg) +{ + /* + * DEMUX register presence depends on the host's CLIDR_EL1. + * This means there's no set of them that we can bless. + */ + if ((reg & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return true; + + return false; +} + +static bool check_supported_feat_reg(struct kvm_vcpu *vcpu, __u64 reg) +{ + int i, ret; + __u64 data, feat_val; + + for (i = 0; i < ARRAY_SIZE(feat_id_regs); i++) { + if (feat_id_regs[i].reg == reg) { + ret = __vcpu_get_reg(vcpu, feat_id_regs[i].id_reg, &data); + if (ret < 0) + return false; + + feat_val = ((data >> feat_id_regs[i].feat_shift) & 0xf); + return feat_val >= feat_id_regs[i].feat_min; + } + } + + return true; +} + +bool check_supported_reg(struct kvm_vcpu *vcpu, __u64 reg) +{ + return check_supported_feat_reg(vcpu, reg); +} + +bool check_reject_set(int err) +{ + return err == EPERM; +} + +void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) +{ + struct vcpu_reg_sublist *s; + int feature; + + for_each_sublist(c, s) { + if (s->finalize) { + feature = s->feature; + vcpu_ioctl(vcpu, KVM_ARM_VCPU_FINALIZE, &feature); + } + } +} + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + +#define CORE_REGS_XX_NR_WORDS 2 +#define CORE_SPSR_XX_NR_WORDS 2 +#define CORE_FPREGS_XX_NR_WORDS 4 + +static const char *core_id_to_str(const char *prefix, __u64 id) +{ + __u64 core_off = id & ~REG_MASK, idx; + + /* + * core_off is the offset into struct kvm_regs + */ + switch (core_off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; + TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(regs.regs[%lld])", idx); + case KVM_REG_ARM_CORE_REG(regs.sp): + return "KVM_REG_ARM_CORE_REG(regs.sp)"; + case KVM_REG_ARM_CORE_REG(regs.pc): + return "KVM_REG_ARM_CORE_REG(regs.pc)"; + case KVM_REG_ARM_CORE_REG(regs.pstate): + return "KVM_REG_ARM_CORE_REG(regs.pstate)"; + case KVM_REG_ARM_CORE_REG(sp_el1): + return "KVM_REG_ARM_CORE_REG(sp_el1)"; + case KVM_REG_ARM_CORE_REG(elr_el1): + return "KVM_REG_ARM_CORE_REG(elr_el1)"; + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; + TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(spsr[%lld])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... 
+ KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; + TEST_ASSERT(idx < 32, "%s: Unexpected fp_regs.vregs index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(fp_regs.vregs[%lld])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; + } + + TEST_FAIL("%s: Unknown core reg id: 0x%llx", prefix, id); + return NULL; +} + +static const char *sve_id_to_str(const char *prefix, __u64 id) +{ + __u64 sve_off, n, i; + + if (id == KVM_REG_ARM64_SVE_VLS) + return "KVM_REG_ARM64_SVE_VLS"; + + sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); + i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); + + TEST_ASSERT(i == 0, "%s: Currently we don't expect slice > 0, reg id 0x%llx", prefix, id); + + switch (sve_off) { + case KVM_REG_ARM64_SVE_ZREG_BASE ... + KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), + "%s: Unexpected bits set in SVE ZREG id: 0x%llx", prefix, id); + return strdup_printf("KVM_REG_ARM64_SVE_ZREG(%lld, 0)", n); + case KVM_REG_ARM64_SVE_PREG_BASE ... + KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), + "%s: Unexpected bits set in SVE PREG id: 0x%llx", prefix, id); + return strdup_printf("KVM_REG_ARM64_SVE_PREG(%lld, 0)", n); + case KVM_REG_ARM64_SVE_FFR_BASE: + TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), + "%s: Unexpected bits set in SVE FFR id: 0x%llx", prefix, id); + return "KVM_REG_ARM64_SVE_FFR(0)"; + } + + return NULL; +} + +void print_reg(const char *prefix, __u64 id) +{ + unsigned op0, op1, crn, crm, op2; + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "%s: KVM_REG_ARM64 missing in reg id: 0x%llx", prefix, id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U8: + reg_size = "KVM_REG_SIZE_U8"; + break; + case KVM_REG_SIZE_U16: + reg_size = "KVM_REG_SIZE_U16"; + break; + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + case KVM_REG_SIZE_U256: + reg_size = "KVM_REG_SIZE_U256"; + break; + case KVM_REG_SIZE_U512: + reg_size = "KVM_REG_SIZE_U512"; + break; + case KVM_REG_SIZE_U1024: + reg_size = "KVM_REG_SIZE_U1024"; + break; + case KVM_REG_SIZE_U2048: + reg_size = "KVM_REG_SIZE_U2048"; + break; + default: + TEST_FAIL("%s: Unexpected reg size: 0x%llx in reg id: 0x%llx", + prefix, (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + } + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(prefix, id)); + break; + case KVM_REG_ARM_DEMUX: + TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), + "%s: Unexpected bits set in DEMUX reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", + reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); + break; + case KVM_REG_ARM64_SYSREG: + op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; + op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; + crn = 
(id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; + crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; + op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; + TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), + "%s: Unexpected bits set in SYSREG reg id: 0x%llx", prefix, id); + printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); + break; + case KVM_REG_ARM_FW: + TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), + "%s: Unexpected bits set in FW reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM_FW_FEAT_BMAP: + TEST_ASSERT(id == KVM_REG_ARM_FW_FEAT_BMAP_REG(id & 0xffff), + "%s: Unexpected bits set in the bitmap feature FW reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM_FW_FEAT_BMAP_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM64_SVE: + printf("\t%s,\n", sve_id_to_str(prefix, id)); + break; + default: + TEST_FAIL("%s: Unexpected coproc type: 0x%llx in reg id: 0x%llx", + prefix, (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + } +} + +/* + * The original blessed list was primed with the output of kernel version + * v4.15 with --core-reg-fixup and then later updated with new registers. + * (The --core-reg-fixup option and it's fixup function have been removed + * from the test, as it's unlikely to use this type of test on a kernel + * older than v5.2.) + * + * The blessed list is up to date with kernel version v6.4 (or so we hope) + */ +static __u64 base_regs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), + KVM_REG_ARM64 | 
KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), + KVM_REG_ARM_FW_REG(0), /* KVM_REG_ARM_PSCI_VERSION */ + KVM_REG_ARM_FW_REG(1), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 */ + KVM_REG_ARM_FW_REG(2), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 */ + KVM_REG_ARM_FW_REG(3), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(0), /* KVM_REG_ARM_STD_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 2), + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ + ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ + ARM64_SYS_REG(2, 0, 0, 0, 4), + ARM64_SYS_REG(2, 0, 0, 0, 5), + ARM64_SYS_REG(2, 0, 0, 0, 6), + ARM64_SYS_REG(2, 0, 0, 0, 7), + ARM64_SYS_REG(2, 0, 0, 1, 4), + ARM64_SYS_REG(2, 0, 0, 1, 5), + ARM64_SYS_REG(2, 0, 0, 1, 6), + ARM64_SYS_REG(2, 0, 0, 1, 7), + ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 4), + ARM64_SYS_REG(2, 0, 0, 2, 5), + ARM64_SYS_REG(2, 0, 0, 2, 6), + ARM64_SYS_REG(2, 0, 0, 2, 7), + ARM64_SYS_REG(2, 0, 0, 3, 4), + ARM64_SYS_REG(2, 0, 0, 3, 5), + ARM64_SYS_REG(2, 0, 0, 3, 6), + ARM64_SYS_REG(2, 0, 0, 3, 7), + ARM64_SYS_REG(2, 0, 0, 4, 4), + ARM64_SYS_REG(2, 0, 0, 4, 5), + ARM64_SYS_REG(2, 0, 0, 4, 6), + ARM64_SYS_REG(2, 0, 0, 4, 7), + ARM64_SYS_REG(2, 
0, 0, 5, 4), + ARM64_SYS_REG(2, 0, 0, 5, 5), + ARM64_SYS_REG(2, 0, 0, 5, 6), + ARM64_SYS_REG(2, 0, 0, 5, 7), + ARM64_SYS_REG(2, 0, 0, 6, 4), + ARM64_SYS_REG(2, 0, 0, 6, 5), + ARM64_SYS_REG(2, 0, 0, 6, 6), + ARM64_SYS_REG(2, 0, 0, 6, 7), + ARM64_SYS_REG(2, 0, 0, 7, 4), + ARM64_SYS_REG(2, 0, 0, 7, 5), + ARM64_SYS_REG(2, 0, 0, 7, 6), + ARM64_SYS_REG(2, 0, 0, 7, 7), + ARM64_SYS_REG(2, 0, 0, 8, 4), + ARM64_SYS_REG(2, 0, 0, 8, 5), + ARM64_SYS_REG(2, 0, 0, 8, 6), + ARM64_SYS_REG(2, 0, 0, 8, 7), + ARM64_SYS_REG(2, 0, 0, 9, 4), + ARM64_SYS_REG(2, 0, 0, 9, 5), + ARM64_SYS_REG(2, 0, 0, 9, 6), + ARM64_SYS_REG(2, 0, 0, 9, 7), + ARM64_SYS_REG(2, 0, 0, 10, 4), + ARM64_SYS_REG(2, 0, 0, 10, 5), + ARM64_SYS_REG(2, 0, 0, 10, 6), + ARM64_SYS_REG(2, 0, 0, 10, 7), + ARM64_SYS_REG(2, 0, 0, 11, 4), + ARM64_SYS_REG(2, 0, 0, 11, 5), + ARM64_SYS_REG(2, 0, 0, 11, 6), + ARM64_SYS_REG(2, 0, 0, 11, 7), + ARM64_SYS_REG(2, 0, 0, 12, 4), + ARM64_SYS_REG(2, 0, 0, 12, 5), + ARM64_SYS_REG(2, 0, 0, 12, 6), + ARM64_SYS_REG(2, 0, 0, 12, 7), + ARM64_SYS_REG(2, 0, 0, 13, 4), + ARM64_SYS_REG(2, 0, 0, 13, 5), + ARM64_SYS_REG(2, 0, 0, 13, 6), + ARM64_SYS_REG(2, 0, 0, 13, 7), + ARM64_SYS_REG(2, 0, 0, 14, 4), + ARM64_SYS_REG(2, 0, 0, 14, 5), + ARM64_SYS_REG(2, 0, 0, 14, 6), + ARM64_SYS_REG(2, 0, 0, 14, 7), + ARM64_SYS_REG(2, 0, 0, 15, 4), + ARM64_SYS_REG(2, 0, 0, 15, 5), + ARM64_SYS_REG(2, 0, 0, 15, 6), + ARM64_SYS_REG(2, 0, 0, 15, 7), + ARM64_SYS_REG(2, 0, 1, 1, 4), /* OSLSR_EL1 */ + ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ + ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 3), + ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 7), + ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 2), /* ID_AA64PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 3), + ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 5), /* ID_AA64SMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 6), + ARM64_SYS_REG(3, 0, 0, 4, 7), + ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 2), + ARM64_SYS_REG(3, 0, 0, 5, 3), + ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 6), + ARM64_SYS_REG(3, 0, 0, 5, 7), + ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ + 
ARM64_SYS_REG(3, 0, 0, 6, 2), /* ID_AA64ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 3), + ARM64_SYS_REG(3, 0, 0, 6, 4), + ARM64_SYS_REG(3, 0, 0, 6, 5), + ARM64_SYS_REG(3, 0, 0, 6, 6), + ARM64_SYS_REG(3, 0, 0, 6, 7), + ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 4), /* ID_AA64MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 5), + ARM64_SYS_REG(3, 0, 0, 7, 6), + ARM64_SYS_REG(3, 0, 0, 7, 7), + ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ + ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ + ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ + ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ + ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ + ARM64_SYS_REG(3, 3, 14, 2, 1), /* CNTP_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 2, 2), /* CNTP_CVAL_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ +}; + +static __u64 pmu_regs[] = { + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ + ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ + ARM64_SYS_REG(3, 3, 14, 8, 0), + ARM64_SYS_REG(3, 3, 14, 8, 1), + ARM64_SYS_REG(3, 3, 14, 8, 2), + ARM64_SYS_REG(3, 3, 14, 8, 3), + ARM64_SYS_REG(3, 3, 14, 8, 4), + ARM64_SYS_REG(3, 3, 14, 8, 5), + ARM64_SYS_REG(3, 3, 14, 8, 6), + ARM64_SYS_REG(3, 3, 14, 8, 7), + ARM64_SYS_REG(3, 3, 14, 9, 0), + ARM64_SYS_REG(3, 3, 14, 9, 1), + ARM64_SYS_REG(3, 3, 14, 9, 2), + ARM64_SYS_REG(3, 3, 14, 9, 3), + ARM64_SYS_REG(3, 3, 14, 9, 4), + ARM64_SYS_REG(3, 3, 14, 9, 5), + ARM64_SYS_REG(3, 3, 14, 9, 6), + ARM64_SYS_REG(3, 3, 14, 9, 7), + ARM64_SYS_REG(3, 3, 14, 10, 0), + ARM64_SYS_REG(3, 3, 14, 10, 1), + ARM64_SYS_REG(3, 3, 14, 10, 2), + ARM64_SYS_REG(3, 3, 14, 10, 3), + ARM64_SYS_REG(3, 3, 14, 10, 4), + ARM64_SYS_REG(3, 3, 14, 10, 5), + ARM64_SYS_REG(3, 3, 14, 10, 6), + ARM64_SYS_REG(3, 3, 14, 10, 7), + 
ARM64_SYS_REG(3, 3, 14, 11, 0), + ARM64_SYS_REG(3, 3, 14, 11, 1), + ARM64_SYS_REG(3, 3, 14, 11, 2), + ARM64_SYS_REG(3, 3, 14, 11, 3), + ARM64_SYS_REG(3, 3, 14, 11, 4), + ARM64_SYS_REG(3, 3, 14, 11, 5), + ARM64_SYS_REG(3, 3, 14, 11, 6), + ARM64_SYS_REG(3, 3, 14, 12, 0), + ARM64_SYS_REG(3, 3, 14, 12, 1), + ARM64_SYS_REG(3, 3, 14, 12, 2), + ARM64_SYS_REG(3, 3, 14, 12, 3), + ARM64_SYS_REG(3, 3, 14, 12, 4), + ARM64_SYS_REG(3, 3, 14, 12, 5), + ARM64_SYS_REG(3, 3, 14, 12, 6), + ARM64_SYS_REG(3, 3, 14, 12, 7), + ARM64_SYS_REG(3, 3, 14, 13, 0), + ARM64_SYS_REG(3, 3, 14, 13, 1), + ARM64_SYS_REG(3, 3, 14, 13, 2), + ARM64_SYS_REG(3, 3, 14, 13, 3), + ARM64_SYS_REG(3, 3, 14, 13, 4), + ARM64_SYS_REG(3, 3, 14, 13, 5), + ARM64_SYS_REG(3, 3, 14, 13, 6), + ARM64_SYS_REG(3, 3, 14, 13, 7), + ARM64_SYS_REG(3, 3, 14, 14, 0), + ARM64_SYS_REG(3, 3, 14, 14, 1), + ARM64_SYS_REG(3, 3, 14, 14, 2), + ARM64_SYS_REG(3, 3, 14, 14, 3), + ARM64_SYS_REG(3, 3, 14, 14, 4), + ARM64_SYS_REG(3, 3, 14, 14, 5), + ARM64_SYS_REG(3, 3, 14, 14, 6), + ARM64_SYS_REG(3, 3, 14, 14, 7), + ARM64_SYS_REG(3, 3, 14, 15, 0), + ARM64_SYS_REG(3, 3, 14, 15, 1), + ARM64_SYS_REG(3, 3, 14, 15, 2), + ARM64_SYS_REG(3, 3, 14, 15, 3), + ARM64_SYS_REG(3, 3, 14, 15, 4), + ARM64_SYS_REG(3, 3, 14, 15, 5), + ARM64_SYS_REG(3, 3, 14, 15, 6), + ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ +}; + +static __u64 vregs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | 
KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), +}; + +static __u64 sve_regs[] = { + KVM_REG_ARM64_SVE_VLS, + KVM_REG_ARM64_SVE_ZREG(0, 0), + KVM_REG_ARM64_SVE_ZREG(1, 0), + KVM_REG_ARM64_SVE_ZREG(2, 0), + KVM_REG_ARM64_SVE_ZREG(3, 0), + KVM_REG_ARM64_SVE_ZREG(4, 0), + KVM_REG_ARM64_SVE_ZREG(5, 0), + KVM_REG_ARM64_SVE_ZREG(6, 0), + KVM_REG_ARM64_SVE_ZREG(7, 0), + KVM_REG_ARM64_SVE_ZREG(8, 0), + KVM_REG_ARM64_SVE_ZREG(9, 0), + KVM_REG_ARM64_SVE_ZREG(10, 0), + KVM_REG_ARM64_SVE_ZREG(11, 0), + KVM_REG_ARM64_SVE_ZREG(12, 0), + KVM_REG_ARM64_SVE_ZREG(13, 0), + KVM_REG_ARM64_SVE_ZREG(14, 0), + KVM_REG_ARM64_SVE_ZREG(15, 0), + KVM_REG_ARM64_SVE_ZREG(16, 0), + KVM_REG_ARM64_SVE_ZREG(17, 0), + KVM_REG_ARM64_SVE_ZREG(18, 0), + KVM_REG_ARM64_SVE_ZREG(19, 0), + KVM_REG_ARM64_SVE_ZREG(20, 0), + KVM_REG_ARM64_SVE_ZREG(21, 0), + KVM_REG_ARM64_SVE_ZREG(22, 0), + KVM_REG_ARM64_SVE_ZREG(23, 0), + KVM_REG_ARM64_SVE_ZREG(24, 0), + KVM_REG_ARM64_SVE_ZREG(25, 0), + KVM_REG_ARM64_SVE_ZREG(26, 0), + KVM_REG_ARM64_SVE_ZREG(27, 0), + KVM_REG_ARM64_SVE_ZREG(28, 0), + KVM_REG_ARM64_SVE_ZREG(29, 0), + KVM_REG_ARM64_SVE_ZREG(30, 0), + KVM_REG_ARM64_SVE_ZREG(31, 0), + KVM_REG_ARM64_SVE_PREG(0, 0), + KVM_REG_ARM64_SVE_PREG(1, 0), + KVM_REG_ARM64_SVE_PREG(2, 0), + KVM_REG_ARM64_SVE_PREG(3, 0), + KVM_REG_ARM64_SVE_PREG(4, 0), + KVM_REG_ARM64_SVE_PREG(5, 0), + KVM_REG_ARM64_SVE_PREG(6, 0), + KVM_REG_ARM64_SVE_PREG(7, 0), + KVM_REG_ARM64_SVE_PREG(8, 0), + KVM_REG_ARM64_SVE_PREG(9, 0), + KVM_REG_ARM64_SVE_PREG(10, 0), + KVM_REG_ARM64_SVE_PREG(11, 0), + KVM_REG_ARM64_SVE_PREG(12, 0), + KVM_REG_ARM64_SVE_PREG(13, 0), + KVM_REG_ARM64_SVE_PREG(14, 0), + KVM_REG_ARM64_SVE_PREG(15, 0), + KVM_REG_ARM64_SVE_FFR(0), + ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ +}; + +static __u64 sve_rejects_set[] = { + KVM_REG_ARM64_SVE_VLS, +}; + +static __u64 pauth_addr_regs[] = { + ARM64_SYS_REG(3, 0, 2, 1, 0), /* APIAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 1), /* APIAKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 2), /* APIBKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 3), /* APIBKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 0), /* APDAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 1), /* APDAKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 2), /* APDBKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 3) /* APDBKEYHI_EL1 */ +}; + +static __u64 pauth_generic_regs[] = { + ARM64_SYS_REG(3, 0, 2, 3, 0), /* APGAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 3, 1), /* APGAKEYHI_EL1 */ +}; + +#define BASE_SUBLIST \ + { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } +#define VREGS_SUBLIST \ + { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), } +#define PMU_SUBLIST \ + { "pmu", 
.capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \ + .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), } +#define SVE_SUBLIST \ + { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \ + .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \ + .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), } +#define PAUTH_SUBLIST \ + { \ + .name = "pauth_address", \ + .capability = KVM_CAP_ARM_PTRAUTH_ADDRESS, \ + .feature = KVM_ARM_VCPU_PTRAUTH_ADDRESS, \ + .regs = pauth_addr_regs, \ + .regs_n = ARRAY_SIZE(pauth_addr_regs), \ + }, \ + { \ + .name = "pauth_generic", \ + .capability = KVM_CAP_ARM_PTRAUTH_GENERIC, \ + .feature = KVM_ARM_VCPU_PTRAUTH_GENERIC, \ + .regs = pauth_generic_regs, \ + .regs_n = ARRAY_SIZE(pauth_generic_regs), \ + } + +static struct vcpu_reg_list vregs_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list vregs_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list sve_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list sve_pmu_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list pauth_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list pauth_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +struct vcpu_reg_list *vcpu_configs[] = { + &vregs_config, + &vregs_pmu_config, + &sve_config, + &sve_pmu_config, + &pauth_config, + &pauth_pmu_config, +}; +int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/arm64/hypercalls.c b/tools/testing/selftests/kvm/arm64/hypercalls.c new file mode 100644 index 000000000000..ec54ec7726e9 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/hypercalls.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* hypercalls: Check the ARM64's psuedo-firmware bitmap register interface. + * + * The test validates the basic hypercall functionalities that are exposed + * via the psuedo-firmware bitmap register. This includes the registers' + * read/write behavior before and after the VM has started, and if the + * hypercalls are properly masked or unmasked to the guest when disabled or + * enabled from the KVM userspace, respectively. 
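+ *
+ * Each register is a feature bitmap; as an example, clearing bit 0 of
+ * KVM_REG_ARM_STD_BMAP (the TRNG v1.0 calls) via KVM_SET_ONE_REG is
+ * expected to make the ARM_SMCCC_TRNG_* hypercalls return
+ * SMCCC_RET_NOT_SUPPORTED in the guest.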
+ */ +#include +#include +#include +#include + +#include "processor.h" + +#define FW_REG_ULIMIT_VAL(max_feat_bit) (GENMASK(max_feat_bit, 0)) + +/* Last valid bits of the bitmapped firmware registers */ +#define KVM_REG_ARM_STD_BMAP_BIT_MAX 0 +#define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX 0 +#define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX 1 + +struct kvm_fw_reg_info { + uint64_t reg; /* Register definition */ + uint64_t max_feat_bit; /* Bit that represents the upper limit of the feature-map */ +}; + +#define FW_REG_INFO(r) \ + { \ + .reg = r, \ + .max_feat_bit = r##_BIT_MAX, \ + } + +static const struct kvm_fw_reg_info fw_reg_info[] = { + FW_REG_INFO(KVM_REG_ARM_STD_BMAP), + FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP), + FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP), +}; + +enum test_stage { + TEST_STAGE_REG_IFACE, + TEST_STAGE_HVC_IFACE_FEAT_DISABLED, + TEST_STAGE_HVC_IFACE_FEAT_ENABLED, + TEST_STAGE_HVC_IFACE_FALSE_INFO, + TEST_STAGE_END, +}; + +static int stage = TEST_STAGE_REG_IFACE; + +struct test_hvc_info { + uint32_t func_id; + uint64_t arg1; +}; + +#define TEST_HVC_INFO(f, a1) \ + { \ + .func_id = f, \ + .arg1 = a1, \ + } + +static const struct test_hvc_info hvc_info[] = { + /* KVM_REG_ARM_STD_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_TRNG_VERSION, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_TRNG_RND64), + TEST_HVC_INFO(ARM_SMCCC_TRNG_GET_UUID, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_RND32, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_RND64, 0), + + /* KVM_REG_ARM_STD_HYP_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_HV_PV_TIME_FEATURES), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_HV_PV_TIME_ST), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_ST, 0), + + /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, + ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0), + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, KVM_PTP_VIRT_COUNTER), +}; + +/* Feed false hypercall info to test the KVM behavior */ +static const struct test_hvc_info false_hvc_info[] = { + /* Feature support check against a different family of hypercalls */ + TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), + TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_TRNG_RND64), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_TRNG_RND64), +}; + +static void guest_test_hvc(const struct test_hvc_info *hc_info) +{ + unsigned int i; + struct arm_smccc_res res; + unsigned int hvc_info_arr_sz; + + hvc_info_arr_sz = + hc_info == hvc_info ? 
ARRAY_SIZE(hvc_info) : ARRAY_SIZE(false_hvc_info); + + for (i = 0; i < hvc_info_arr_sz; i++, hc_info++) { + memset(&res, 0, sizeof(res)); + smccc_hvc(hc_info->func_id, hc_info->arg1, 0, 0, 0, 0, 0, 0, &res); + + switch (stage) { + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + __GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED, + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", + res.a0, hc_info->func_id, hc_info->arg1, stage); + break; + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + __GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED, + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", + res.a0, hc_info->func_id, hc_info->arg1, stage); + break; + default: + GUEST_FAIL("Unexpected stage = %u", stage); + } + } +} + +static void guest_code(void) +{ + while (stage != TEST_STAGE_END) { + switch (stage) { + case TEST_STAGE_REG_IFACE: + break; + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + guest_test_hvc(hvc_info); + break; + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + guest_test_hvc(false_hvc_info); + break; + default: + GUEST_FAIL("Unexpected stage = %u", stage); + } + + GUEST_SYNC(stage); + } + + GUEST_DONE(); +} + +struct st_time { + uint32_t rev; + uint32_t attr; + uint64_t st_time; +}; + +#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) +#define ST_GPA_BASE (1 << 30) + +static void steal_time_init(struct kvm_vcpu *vcpu) +{ + uint64_t st_ipa = (ulong)ST_GPA_BASE; + unsigned int gpages; + + gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE); + vm_userspace_mem_region_add(vcpu->vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); + + vcpu_device_attr_set(vcpu, KVM_ARM_VCPU_PVTIME_CTRL, + KVM_ARM_VCPU_PVTIME_IPA, &st_ipa); +} + +static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) +{ + uint64_t val; + unsigned int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { + const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + + /* First 'read' should be an upper limit of the features supported */ + val = vcpu_get_reg(vcpu, reg_info->reg); + TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), + "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", + reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); + + /* Test a 'write' by disabling all the features of the register map */ + ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + TEST_ASSERT(ret == 0, + "Failed to clear all the features of reg: 0x%lx; ret: %d", + reg_info->reg, errno); + + val = vcpu_get_reg(vcpu, reg_info->reg); + TEST_ASSERT(val == 0, + "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); + + /* + * Test enabling a feature that's not supported. + * Avoid this check if all the bits are occupied. + */ + if (reg_info->max_feat_bit < 63) { + ret = __vcpu_set_reg(vcpu, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx", + errno, reg_info->reg); + } + } +} + +static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) +{ + uint64_t val; + unsigned int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { + const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + + /* + * Before starting the VM, the test clears all the bits. + * Check if that's still the case. 
+ */ + val = vcpu_get_reg(vcpu, reg_info->reg); + TEST_ASSERT(val == 0, + "Expected all the features to be cleared for reg: 0x%lx", + reg_info->reg); + + /* + * Since the VM has run at least once, KVM shouldn't allow modification of + * the registers and should return EBUSY. Set the registers and check for + * the expected errno. + */ + ret = __vcpu_set_reg(vcpu, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); + TEST_ASSERT(ret != 0 && errno == EBUSY, + "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx", + errno, reg_info->reg); + } +} + +static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu) +{ + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + + steal_time_init(*vcpu); + + return vm; +} + +static void test_guest_stage(struct kvm_vm **vm, struct kvm_vcpu **vcpu) +{ + int prev_stage = stage; + + pr_debug("Stage: %d\n", prev_stage); + + /* Sync the stage early, the VM might be freed below. */ + stage++; + sync_global_to_guest(*vm, stage); + + switch (prev_stage) { + case TEST_STAGE_REG_IFACE: + test_fw_regs_after_vm_start(*vcpu); + break; + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + /* Start a new VM so that all the features are now enabled by default */ + kvm_vm_free(*vm); + *vm = test_vm_create(vcpu); + break; + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + break; + default: + TEST_FAIL("Unknown test stage: %d", prev_stage); + } +} + +static void test_run(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool guest_done = false; + + vm = test_vm_create(&vcpu); + + test_fw_regs_before_vm_start(vcpu); + + while (!guest_done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + test_guest_stage(&vm, &vcpu); + break; + case UCALL_DONE: + guest_done = true; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected guest exit"); + } + } + + kvm_vm_free(vm); +} + +int main(void) +{ + test_run(); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/mmio_abort.c b/tools/testing/selftests/kvm/arm64/mmio_abort.c new file mode 100644 index 000000000000..8b7a80a51b1c --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/mmio_abort.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mmio_abort - Tests for userspace MMIO abort injection + * + * Copyright (c) 2024 Google LLC + */ +#include "processor.h" +#include "test_util.h" + +#define MMIO_ADDR 0x8000000ULL + +static u64 expected_abort_pc; + +static void expect_sea_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + + GUEST_ASSERT_EQ(regs->pc, expected_abort_pc); + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + + GUEST_DONE(); +} + +static void unexpected_dabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected data abort at PC: %lx\n", regs->pc); +} + +static struct kvm_vm *vm_create_with_dabt_handler(struct kvm_vcpu **vcpu, void *guest_code, + handler_fn dabt_handler) +{ + struct kvm_vm *vm = vm_create_with_one_vcpu(vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(*vcpu); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_ELx_EC_DABT_CUR, dabt_handler); + + virt_map(vm, MMIO_ADDR, MMIO_ADDR, 1); + + return vm; +} + +static void vcpu_inject_extabt(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events = {}; + + events.exception.ext_dabt_pending = true; + 
vcpu_events_set(vcpu, &events); +} + +static void vcpu_run_expect_done(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +extern char test_mmio_abort_insn; + +static void test_mmio_abort_guest(void) +{ + WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_abort_insn); + + asm volatile("test_mmio_abort_insn:\n\t" + "ldr x0, [%0]\n\t" + : : "r" (MMIO_ADDR) : "x0", "memory"); + + GUEST_FAIL("MMIO instruction should not retire"); +} + +/* + * Test that KVM doesn't complete MMIO emulation when userspace has made an + * external abort pending for the instruction. + */ +static void test_mmio_abort(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_abort_guest, + expect_sea_handler); + struct kvm_run *run = vcpu->run; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO); + TEST_ASSERT_EQ(run->mmio.phys_addr, MMIO_ADDR); + TEST_ASSERT_EQ(run->mmio.len, sizeof(unsigned long)); + TEST_ASSERT(!run->mmio.is_write, "Expected MMIO read"); + + vcpu_inject_extabt(vcpu); + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + +extern char test_mmio_nisv_insn; + +static void test_mmio_nisv_guest(void) +{ + WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_nisv_insn); + + asm volatile("test_mmio_nisv_insn:\n\t" + "ldr x0, [%0], #8\n\t" + : : "r" (MMIO_ADDR) : "x0", "memory"); + + GUEST_FAIL("MMIO instruction should not retire"); +} + +/* + * Test that the KVM_RUN ioctl fails for ESR_EL2.ISV=0 MMIO aborts if userspace + * hasn't enabled KVM_CAP_ARM_NISV_TO_USER. + */ +static void test_mmio_nisv(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, + unexpected_dabt_handler); + + TEST_ASSERT(_vcpu_run(vcpu), "Expected nonzero return code from KVM_RUN"); + TEST_ASSERT_EQ(errno, ENOSYS); + + kvm_vm_free(vm); +} + +/* + * Test that ESR_EL2.ISV=0 MMIO aborts reach userspace and that an injected SEA + * reaches the guest. + */ +static void test_mmio_nisv_abort(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, + expect_sea_handler); + struct kvm_run *run = vcpu->run; + + vm_enable_cap(vm, KVM_CAP_ARM_NISV_TO_USER, 1); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_NISV); + TEST_ASSERT_EQ(run->arm_nisv.fault_ipa, MMIO_ADDR); + + vcpu_inject_extabt(vcpu); + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + +int main(void) +{ + test_mmio_abort(); + test_mmio_nisv(); + test_mmio_nisv_abort(); +} diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c new file mode 100644 index 000000000000..ebd70430c89d --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that, on a GICv3 system, not configuring GICv3 correctly +// results in all of the sysregs generating an UNDEF exception. 
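+//
+// "Configuring GICv3 correctly" here means creating an in-kernel vGICv3
+// for the VM (KVM_CREATE_DEVICE with KVM_DEV_TYPE_ARM_VGIC_V3, as the
+// selftest vgic_v3_setup() helper does); without one, KVM reports
+// ID_AA64PFR0_EL1.GIC == 0 and the ICC_* system registers trap to UNDEF.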
+ +#include +#include +#include + +static volatile bool handled; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + handled = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +#define __check_sr_write(r) \ + do { \ + handled = false; \ + dsb(sy); \ + write_sysreg_s(0, SYS_ ## r); \ + isb(); \ + } while(0) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(handled, #r " no read trap"); \ + } while(0) + +#define check_sr_write(r) \ + do { \ + __check_sr_write(r); \ + __GUEST_ASSERT(handled, #r " no write trap"); \ + } while(0) + +#define check_sr_rw(r) \ + do { \ + check_sr_read(r); \ + check_sr_write(r); \ + } while(0) + +static void guest_code(void) +{ + uint64_t val; + + /* + * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having + * hidden the feature at runtime without any other userspace action. + */ + __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + read_sysreg(id_aa64pfr0_el1)) == 0, + "GICv3 wrongly advertised"); + + /* + * Access all GICv3 registers, and fail if we don't get an UNDEF. + * Note that we happily access all the APxRn registers without + * checking their existance, as all we want to see is a failure. + */ + check_sr_rw(ICC_PMR_EL1); + check_sr_read(ICC_IAR0_EL1); + check_sr_write(ICC_EOIR0_EL1); + check_sr_rw(ICC_HPPIR0_EL1); + check_sr_rw(ICC_BPR0_EL1); + check_sr_rw(ICC_AP0R0_EL1); + check_sr_rw(ICC_AP0R1_EL1); + check_sr_rw(ICC_AP0R2_EL1); + check_sr_rw(ICC_AP0R3_EL1); + check_sr_rw(ICC_AP1R0_EL1); + check_sr_rw(ICC_AP1R1_EL1); + check_sr_rw(ICC_AP1R2_EL1); + check_sr_rw(ICC_AP1R3_EL1); + check_sr_write(ICC_DIR_EL1); + check_sr_read(ICC_RPR_EL1); + check_sr_write(ICC_SGI1R_EL1); + check_sr_write(ICC_ASGI1R_EL1); + check_sr_write(ICC_SGI0R_EL1); + check_sr_read(ICC_IAR1_EL1); + check_sr_write(ICC_EOIR1_EL1); + check_sr_rw(ICC_HPPIR1_EL1); + check_sr_rw(ICC_BPR1_EL1); + check_sr_rw(ICC_CTLR_EL1); + check_sr_rw(ICC_IGRPEN0_EL1); + check_sr_rw(ICC_IGRPEN1_EL1); + + /* + * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can + * be RAO/WI. Engage in non-fatal accesses, starting with a + * write of 0 to try and disable SRE, and let's see if it + * sticks. + */ + __check_sr_write(ICC_SRE_EL1); + if (!handled) + GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n"); + + val = __check_sr_read(ICC_SRE_EL1); + if (!handled) { + __GUEST_ASSERT((val & BIT(0)), + "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n"); + GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n"); + } + + GUEST_DONE(); +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + /* Success, we've gracefully exploded! 
*/ + handled = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_no_gicv3(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Create a VM without a GICv3 */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t pfr0; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); + __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), + "GICv3 not supported."); + kvm_vm_free(vm); + + test_guest_no_gicv3(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/page_fault_test.c b/tools/testing/selftests/kvm/arm64/page_fault_test.c new file mode 100644 index 000000000000..ec33a8f9c908 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/page_fault_test.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * page_fault_test.c - Test stage 2 faults. + * + * This test tries different combinations of guest accesses (e.g., write, + * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on + * hugetlbfs with a hole). It checks that the expected handling method is + * called (e.g., uffd faults with the right address and write/read flag). + */ +#include +#include +#include +#include +#include +#include +#include +#include "guest_modes.h" +#include "userfaultfd_util.h" + +/* Guest virtual addresses that point to the test page and its PTE. */ +#define TEST_GVA 0xc0000000 +#define TEST_EXEC_GVA (TEST_GVA + 0x8) +#define TEST_PTE_GVA 0xb0000000 +#define TEST_DATA 0x0123456789ABCDEF + +static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; + +#define CMD_NONE (0) +#define CMD_SKIP_TEST (1ULL << 1) +#define CMD_HOLE_PT (1ULL << 2) +#define CMD_HOLE_DATA (1ULL << 3) +#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4) +#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5) +#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6) +#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7) +#define CMD_SET_PTE_AF (1ULL << 8) + +#define PREPARE_FN_NR 10 +#define CHECK_FN_NR 10 + +static struct event_cnt { + int mmio_exits; + int fail_vcpu_runs; + int uffd_faults; + /* uffd_faults is incremented from multiple threads. 
*/ + pthread_mutex_t uffd_faults_mutex; +} events; + +struct test_desc { + const char *name; + uint64_t mem_mark_cmd; + /* Skip the test if any prepare function returns false */ + bool (*guest_prepare[PREPARE_FN_NR])(void); + void (*guest_test)(void); + void (*guest_test_check[CHECK_FN_NR])(void); + uffd_handler_t uffd_pt_handler; + uffd_handler_t uffd_data_handler; + void (*dabt_handler)(struct ex_regs *regs); + void (*iabt_handler)(struct ex_regs *regs); + void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run); + void (*fail_vcpu_run_handler)(int ret); + uint32_t pt_memslot_flags; + uint32_t data_memslot_flags; + bool skip; + struct event_cnt expected_events; +}; + +struct test_params { + enum vm_mem_backing_src_type src_type; + struct test_desc *test_desc; +}; + +static inline void flush_tlb_page(uint64_t vaddr) +{ + uint64_t page = vaddr >> 12; + + dsb(ishst); + asm volatile("tlbi vaae1is, %0" :: "r" (page)); + dsb(ish); + isb(); +} + +static void guest_write64(void) +{ + uint64_t val; + + WRITE_ONCE(*guest_test_memory, TEST_DATA); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +/* Check the system for atomic instructions. */ +static bool guest_check_lse(void) +{ + uint64_t isar0 = read_sysreg(id_aa64isar0_el1); + uint64_t atomic; + + atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0); + return atomic >= 2; +} + +static bool guest_check_dc_zva(void) +{ + uint64_t dczid = read_sysreg(dczid_el0); + uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid); + + return dzp == 0; +} + +/* Compare and swap instruction. */ +static void guest_cas(void) +{ + uint64_t val; + + GUEST_ASSERT(guest_check_lse()); + asm volatile(".arch_extension lse\n" + "casal %0, %1, [%2]\n" + :: "r" (0ul), "r" (TEST_DATA), "r" (guest_test_memory)); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +static void guest_read64(void) +{ + uint64_t val; + + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* Address translation instruction */ +static void guest_at(void) +{ + uint64_t par; + + asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); + isb(); + par = read_sysreg(par_el1); + + /* Bit 1 indicates whether the AT was successful */ + GUEST_ASSERT_EQ(par & 1, 0); +} + +/* + * The size of the block written by "dc zva" is guaranteed to be between (2 << + * 0) and (2 << 9), which is safe in our case as we need the write to happen + * for at least a word, and not more than a page. + */ +static void guest_dc_zva(void) +{ + uint16_t val; + + asm volatile("dc zva, %0" :: "r" (guest_test_memory)); + dsb(ish); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* + * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0). + * And that's special because KVM must take special care with those: they + * should still count as accesses for dirty logging or user-faulting, but + * should be handled differently on mmio. + */ +static void guest_ld_preidx(void) +{ + uint64_t val; + uint64_t addr = TEST_GVA - 8; + + /* + * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is + * in a gap between memslots not backing by anything. + */ + asm volatile("ldr %0, [%1, #8]!" + : "=r" (val), "+r" (addr)); + GUEST_ASSERT_EQ(val, 0); + GUEST_ASSERT_EQ(addr, TEST_GVA); +} + +static void guest_st_preidx(void) +{ + uint64_t val = TEST_DATA; + uint64_t addr = TEST_GVA - 8; + + asm volatile("str %0, [%1, #8]!" 
+ : "+r" (val), "+r" (addr)); + + GUEST_ASSERT_EQ(addr, TEST_GVA); + val = READ_ONCE(*guest_test_memory); +} + +static bool guest_set_ha(void) +{ + uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1); + uint64_t hadbs, tcr; + + /* Skip if HA is not supported. */ + hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1); + if (hadbs == 0) + return false; + + tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; + write_sysreg(tcr, tcr_el1); + isb(); + + return true; +} + +static bool guest_clear_pte_af(void) +{ + *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF; + flush_tlb_page(TEST_GVA); + + return true; +} + +static void guest_check_pte_af(void) +{ + dsb(ish); + GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); +} + +static void guest_check_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_no_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); +} + +static void guest_check_no_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG); +} + +static void guest_exec(void) +{ + int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; + int ret; + + ret = code(); + GUEST_ASSERT_EQ(ret, 0x77); +} + +static bool guest_prepare(struct test_desc *test) +{ + bool (*prepare_fn)(void); + int i; + + for (i = 0; i < PREPARE_FN_NR; i++) { + prepare_fn = test->guest_prepare[i]; + if (prepare_fn && !prepare_fn()) + return false; + } + + return true; +} + +static void guest_test_check(struct test_desc *test) +{ + void (*check_fn)(void); + int i; + + for (i = 0; i < CHECK_FN_NR; i++) { + check_fn = test->guest_test_check[i]; + if (check_fn) + check_fn(); + } +} + +static void guest_code(struct test_desc *test) +{ + if (!guest_prepare(test)) + GUEST_SYNC(CMD_SKIP_TEST); + + GUEST_SYNC(test->mem_mark_cmd); + + if (test->guest_test) + test->guest_test(); + + guest_test_check(test); + GUEST_DONE(); +} + +static void no_dabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1)); +} + +static void no_iabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected iabt, pc = 0x%lx", regs->pc); +} + +static struct uffd_args { + char *copy; + void *hva; + uint64_t paging_size; +} pt_args, data_args; + +/* Returns true to continue the test, and false if it should be skipped. 
*/ +static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, + struct uffd_args *args) +{ + uint64_t addr = msg->arg.pagefault.address; + uint64_t flags = msg->arg.pagefault.flags; + struct uffdio_copy copy; + int ret; + + TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, + "The only expected UFFD mode is MISSING"); + TEST_ASSERT_EQ(addr, (uint64_t)args->hva); + + pr_debug("uffd fault: addr=%p write=%d\n", + (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE)); + + copy.src = (uint64_t)args->copy; + copy.dst = addr; + copy.len = args->paging_size; + copy.mode = 0; + + ret = ioctl(uffd, UFFDIO_COPY, &copy); + if (ret == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n", + addr, errno); + return ret; + } + + pthread_mutex_lock(&events.uffd_faults_mutex); + events.uffd_faults += 1; + pthread_mutex_unlock(&events.uffd_faults_mutex); + return 0; +} + +static int uffd_pt_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &pt_args); +} + +static int uffd_data_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &data_args); +} + +static void setup_uffd_args(struct userspace_mem_region *region, + struct uffd_args *args) +{ + args->hva = (void *)region->region.userspace_addr; + args->paging_size = region->region.memory_size; + + args->copy = malloc(args->paging_size); + TEST_ASSERT(args->copy, "Failed to allocate data copy."); + memcpy(args->copy, args->hva, args->paging_size); +} + +static void setup_uffd(struct kvm_vm *vm, struct test_params *p, + struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd) +{ + struct test_desc *test = p->test_desc; + int uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args); + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args); + + *pt_uffd = NULL; + if (test->uffd_pt_handler) + *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, + pt_args.hva, + pt_args.paging_size, + 1, test->uffd_pt_handler); + + *data_uffd = NULL; + if (test->uffd_data_handler) + *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, + data_args.hva, + data_args.paging_size, + 1, test->uffd_data_handler); +} + +static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, + struct uffd_desc *data_uffd) +{ + if (test->uffd_pt_handler) + uffd_stop_demand_paging(pt_uffd); + if (test->uffd_data_handler) + uffd_stop_demand_paging(data_uffd); + + free(pt_args.copy); + free(data_args.copy); +} + +static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg) +{ + TEST_FAIL("There was no UFFD fault expected."); + return -1; +} + +/* Returns false if the test should be skipped. 
*/ +static bool punch_hole_in_backing_store(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + void *hva = (void *)region->region.userspace_addr; + uint64_t paging_size = region->region.memory_size; + int ret, fd = region->fd; + + if (fd != -1) { + ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, paging_size); + TEST_ASSERT(ret == 0, "fallocate failed"); + } else { + ret = madvise(hva, paging_size, MADV_DONTNEED); + TEST_ASSERT(ret == 0, "madvise failed"); + } + + return true; +} + +static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + TEST_ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr); + + memcpy(hva, run->mmio.data, run->mmio.len); + events.mmio_exits += 1; +} + +static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + uint64_t data; + + memcpy(&data, run->mmio.data, sizeof(data)); + pr_debug("addr=%lld len=%d w=%d data=%lx\n", + run->mmio.phys_addr, run->mmio.len, + run->mmio.is_write, data); + TEST_FAIL("There was no MMIO exit expected."); +} + +static bool check_write_in_dirty_log(struct kvm_vm *vm, + struct userspace_mem_region *region, + uint64_t host_pg_nr) +{ + unsigned long *bmap; + bool first_page_dirty; + uint64_t size = region->region.memory_size; + + /* getpage_size() is not always equal to vm->page_size */ + bmap = bitmap_zalloc(size / getpagesize()); + kvm_vm_get_dirty_log(vm, region->region.slot, bmap); + first_page_dirty = test_bit(host_pg_nr, bmap); + free(bmap); + return first_page_dirty; +} + +/* Returns true to continue the test, and false if it should be skipped. */ +static bool handle_cmd(struct kvm_vm *vm, int cmd) +{ + struct userspace_mem_region *data_region, *pt_region; + bool continue_test = true; + uint64_t pte_gpa, pte_pg; + + data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + pte_pg = (pte_gpa - pt_region->region.guest_phys_addr) / getpagesize(); + + if (cmd == CMD_SKIP_TEST) + continue_test = false; + + if (cmd & CMD_HOLE_PT) + continue_test = punch_hole_in_backing_store(vm, pt_region); + if (cmd & CMD_HOLE_DATA) + continue_test = punch_hole_in_backing_store(vm, data_region); + if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), + "Missing write in dirty log"); + if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, pte_pg), + "Missing s1ptw write in dirty log"); + if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), + "Unexpected write in dirty log"); + if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, pte_pg), + "Unexpected s1ptw write in dirty log"); + + return continue_test; +} + +void fail_vcpu_run_no_handler(int ret) +{ + TEST_FAIL("Unexpected vcpu run failure"); +} + +void fail_vcpu_run_mmio_no_syndrome_handler(int ret) +{ + TEST_ASSERT(errno == ENOSYS, + "The mmio handler should have returned not implemented."); + events.fail_vcpu_runs += 1; +} + +typedef uint32_t aarch64_insn_t; +extern aarch64_insn_t __exec_test[2]; + +noinline void __return_0x77(void) +{ + asm volatile("__exec_test: mov x0, #0x77\n" + "ret\n"); +} + +/* + * Note that this function runs on the host before the test VM 
starts: there's + * no need to sync the D$ and I$ caches. + */ +static void load_exec_code_for_test(struct kvm_vm *vm) +{ + uint64_t *code; + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + assert(TEST_EXEC_GVA > TEST_GVA); + code = hva + TEST_EXEC_GVA - TEST_GVA; + memcpy(code, __exec_test, sizeof(__exec_test)); +} + +static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_DABT_CUR, no_dabt_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_IABT_CUR, no_iabt_handler); +} + +static void setup_gva_maps(struct kvm_vm *vm) +{ + struct userspace_mem_region *region; + uint64_t pte_gpa; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + /* Map TEST_GVA first. This will install a new PTE. */ + virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr); + /* Then map TEST_PTE_GVA to the above PTE. */ + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + virt_pg_map(vm, TEST_PTE_GVA, pte_gpa); +} + +enum pf_test_memslots { + CODE_AND_DATA_MEMSLOT, + PAGE_TABLE_MEMSLOT, + TEST_DATA_MEMSLOT, +}; + +/* + * Create a memslot for code and data at pfn=0, and test-data and PT ones + * at max_gfn. + */ +static void setup_memslots(struct kvm_vm *vm, struct test_params *p) +{ + uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type); + uint64_t guest_page_size = vm->page_size; + uint64_t max_gfn = vm_compute_max_gfn(vm); + /* Enough for 2M of code when using 4K guest pages. */ + uint64_t code_npages = 512; + uint64_t pt_size, data_size, data_gpa; + + /* + * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using + * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13 + * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use + * twice that just in case. 
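+	 * That is (1 + 2 + 4 + 6) * 2 = 26 pages, matching the pt_size used below.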
+ */ + pt_size = 26 * guest_page_size; + + /* memslot sizes and gpa's must be aligned to the backing page size */ + pt_size = align_up(pt_size, backing_src_pagesz); + data_size = align_up(guest_page_size, backing_src_pagesz); + data_gpa = (max_gfn * guest_page_size) - data_size; + data_gpa = align_down(data_gpa, backing_src_pagesz); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, + CODE_AND_DATA_MEMSLOT, code_npages, 0); + vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT; + vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size, + PAGE_TABLE_MEMSLOT, pt_size / guest_page_size, + p->test_desc->pt_memslot_flags); + vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT, + data_size / guest_page_size, + p->test_desc->data_memslot_flags); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void setup_ucall(struct kvm_vm *vm) +{ + struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + + ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size); +} + +static void setup_default_handlers(struct test_desc *test) +{ + if (!test->mmio_handler) + test->mmio_handler = mmio_no_handler; + + if (!test->fail_vcpu_run_handler) + test->fail_vcpu_run_handler = fail_vcpu_run_no_handler; +} + +static void check_event_counts(struct test_desc *test) +{ + TEST_ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); + TEST_ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits); + TEST_ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs); +} + +static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) +{ + struct test_desc *test = p->test_desc; + + pr_debug("Test: %s\n", test->name); + pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(p->src_type)->name); +} + +static void reset_event_counts(void) +{ + memset(&events, 0, sizeof(events)); +} + +/* + * This function either succeeds, skips the test (after setting test->skip), or + * fails with a TEST_FAIL that aborts all tests. + */ +static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + struct kvm_run *run; + struct ucall uc; + int ret; + + run = vcpu->run; + + for (;;) { + ret = _vcpu_run(vcpu); + if (ret) { + test->fail_vcpu_run_handler(ret); + goto done; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!handle_cmd(vm, uc.args[1])) { + test->skip = true; + goto done; + } + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + case UCALL_NONE: + if (run->exit_reason == KVM_EXIT_MMIO) + test->mmio_handler(vm, run); + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + pr_debug(test->skip ? 
"Skipped.\n" : "Done.\n"); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = (struct test_params *)arg; + struct test_desc *test = p->test_desc; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct uffd_desc *pt_uffd, *data_uffd; + + print_test_banner(mode, p); + + vm = ____vm_create(VM_SHAPE(mode)); + setup_memslots(vm, p); + kvm_vm_elf_load(vm, program_invocation_name); + setup_ucall(vm); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + setup_gva_maps(vm); + + reset_event_counts(); + + /* + * Set some code in the data memslot for the guest to execute (only + * applicable to the EXEC tests). This has to be done before + * setup_uffd() as that function copies the memslot data for the uffd + * handler. + */ + load_exec_code_for_test(vm); + setup_uffd(vm, p, &pt_uffd, &data_uffd); + setup_abort_handlers(vm, vcpu, test); + setup_default_handlers(test); + vcpu_args_set(vcpu, 1, test); + + vcpu_run_loop(vm, vcpu, test); + + kvm_vm_free(vm); + free_uffd(test, pt_uffd, data_uffd); + + /* + * Make sure we check the events after the uffd threads have exited, + * which means they updated their respective event counters. + */ + if (!test->skip) + check_event_counts(test); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-s mem-type]\n", name); + puts(""); + guest_modes_help(); + backing_src_help("-s"); + puts(""); +} + +#define SNAME(s) #s +#define SCAT2(a, b) SNAME(a ## _ ## b) +#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) +#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d)) + +#define _CHECK(_test) _CHECK_##_test +#define _PREPARE(_test) _PREPARE_##_test +#define _PREPARE_guest_read64 NULL +#define _PREPARE_guest_ld_preidx NULL +#define _PREPARE_guest_write64 NULL +#define _PREPARE_guest_st_preidx NULL +#define _PREPARE_guest_exec NULL +#define _PREPARE_guest_at NULL +#define _PREPARE_guest_dc_zva guest_check_dc_zva +#define _PREPARE_guest_cas guest_check_lse + +/* With or without access flag checks */ +#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af +#define _PREPARE_no_af NULL +#define _CHECK_with_af guest_check_pte_af +#define _CHECK_no_af NULL + +/* Performs an access and checks that no faults were triggered. 
*/ +#define TEST_ACCESS(_access, _with_af, _mark_cmd) \ +{ \ + .name = SCAT3(_access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .expected_events = { 0 }, \ +} + +#define TEST_UFFD(_access, _with_af, _mark_cmd, \ + _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \ +{ \ + .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = _uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ +} + +#define TEST_DIRTY_LOG(_access, _with_af, _test_check, _pt_check) \ +{ \ + .name = SCAT3(dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ + .expected_events = { 0 }, \ +} + +#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ + _uffd_faults, _test_check, _pt_check) \ +{ \ + .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ +} + +#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ +{ \ + .name = SCAT2(ro_memslot, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ +} + +#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ + _test_check) \ +{ \ + .name = SCAT2(ro_memslot, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits}, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \ +{ \ + .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + 
.expected_events = { .fail_vcpu_runs = 1 }, \ +} + +#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \ + _uffd_data_handler, _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_uffd, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits, \ + .uffd_faults = _uffd_faults }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \ + _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1, \ + .uffd_faults = _uffd_faults }, \ +} + +static struct test_desc tests[] = { + + /* Check that HW is setting the Access Flag (AF) (sanity checks). */ + TEST_ACCESS(guest_read64, with_af, CMD_NONE), + TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_cas, with_af, CMD_NONE), + TEST_ACCESS(guest_write64, with_af, CMD_NONE), + TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE), + TEST_ACCESS(guest_exec, with_af, CMD_NONE), + + /* + * Punch a hole in the data backing store, and then try multiple + * accesses: reads should return zeroes, and writes should + * re-populate the page. Moreover, the test also checks that no + * exception was generated in the guest. Note that this + * reading/writing behavior is the same as reading/writing a + * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from + * userspace. + */ + TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), + + /* + * Punch holes in the data and PT backing stores and mark them for + * userfaultfd handling. This should result in 2 faults: the access + * on the data backing store, and its respective S1 page table walk + * (S1PTW). + */ + TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + /* + * Can't test guest_at with_af as it's IMPDEF whether the AF is set. + * The S1PTW fault should still be marked as a write. 
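+	 * Only the S1PTW on the PT backing store is expected for AT, which is
+	 * why the entry below uses uffd_no_handler for data and expects 1 fault.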
+ */ + TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_no_handler, uffd_pt_handler, 1), + TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + + /* + * Try accesses when the data and PT memory regions are both + * tracked for dirty logging. + */ + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + + /* + * Access when the data and PT memory regions are both marked for + * dirty logging and UFFD at the same time. The expected result is + * that writes should mark the dirty log and trigger a userfaultfd + * write fault. Reads/execs should result in a read userfaultfd + * fault, and nothing in the dirty log. Any S1PTW should result in + * a write in the dirty log and a userfaultfd write. 
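+	 * (This is why most entries below expect two uffd faults: one for the
+	 * data access and one for the S1PTW; guest_at expects only the S1PTW.)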
+ */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, + uffd_data_handler, + 2, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, uffd_no_handler, 1, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + /* + * Access when both the PT and data regions are marked read-only + * (with KVM_MEM_READONLY). Writes with a syndrome result in an + * MMIO exit, writes with no syndrome (e.g., CAS) result in a + * failed vcpu run, and reads/execs with and without syndroms do + * not fault. + */ + TEST_RO_MEMSLOT(guest_read64, 0, 0), + TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0), + TEST_RO_MEMSLOT(guest_at, 0, 0), + TEST_RO_MEMSLOT(guest_exec, 0, 0), + TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), + + /* + * The PT and data regions are both read-only and marked + * for dirty logging at the same time. The expected result is that + * for writes there should be no write in the dirty log. The + * readonly handling is the same as if the memslot was not marked + * for dirty logging: writes with a syndrome result in an MMIO + * exit, and writes with no syndrome result in a failed vcpu run. + */ + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler, + 1, guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx, + guest_check_no_write_in_dirty_log), + + /* + * The PT and data regions are both read-only and punched with + * holes tracked with userfaultfd. The expected result is the + * union of both userfaultfd and read-only behaviors. For example, + * write accesses result in a userfaultfd write fault and an MMIO + * exit. Writes with no syndrome result in a failed vcpu run and + * no userfaultfd write fault. Reads result in userfaultfd getting + * triggered. 
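+	 * The expected events below therefore combine uffd_faults with either
+	 * mmio_exits or fail_vcpu_runs.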
+ */ + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, + uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, uffd_no_handler, 1), + + { 0 } +}; + +static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type) +{ + struct test_desc *t; + + for (t = &tests[0]; t->name; t++) { + if (t->skip) + continue; + + struct test_params p = { + .src_type = src_type, + .test_desc = t, + }; + + for_each_guest_mode(run_test, &p); + } +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type; + int opt; + + src_type = DEFAULT_VM_MEM_SRC; + + while ((opt = getopt(argc, argv, "hm:s:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_test_and_guest_mode(src_type); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/psci_test.c b/tools/testing/selftests/kvm/arm64/psci_test.c new file mode 100644 index 000000000000..ab491ee9e5f7 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/psci_test.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * psci_test - Tests relating to KVM's PSCI implementation. + * + * Copyright (c) 2021 Google LLC. + * + * This test includes: + * - A regression test for a race between KVM servicing the PSCI CPU_ON call + * and userspace reading the targeted vCPU's registers. + * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated + * KVM_SYSTEM_EVENT_SUSPEND UAPI. 
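+ * - A test for KVM's handling of PSCI SYSTEM_OFF2 and the PSCI_OFF2
+ *   shutdown event flag.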
+ */ + +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define CPU_ON_ENTRY_ADDR 0xfeedf00dul +#define CPU_ON_CONTEXT_ID 0xdeadc0deul + +static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, + uint64_t context_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_0_2_FN64_CPU_ON, target_cpu, entry_addr, context_id, + 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_affinity_info(uint64_t target_affinity, + uint64_t lowest_affinity_level) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity, lowest_affinity_level, + 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_0_FN64_SYSTEM_SUSPEND, entry_addr, context_id, + 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_system_off2(uint64_t type, uint64_t cookie) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2, type, cookie, 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_features(uint32_t func_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_0_FN_PSCI_FEATURES, func_id, 0, 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static void vcpu_power_off(struct kvm_vcpu *vcpu) +{ + struct kvm_mp_state mp_state = { + .mp_state = KVM_MP_STATE_STOPPED, + }; + + vcpu_mp_state_set(vcpu, &mp_state); +} + +static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, + struct kvm_vcpu **target) +{ + struct kvm_vcpu_init init; + struct kvm_vm *vm; + + vm = vm_create(2); + + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + *source = aarch64_vcpu_add(vm, 0, &init, guest_code); + *target = aarch64_vcpu_add(vm, 1, &init, guest_code); + + return vm; +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + if (get_ucall(vcpu, &uc) == UCALL_ABORT) + REPORT_GUEST_ASSERT(uc); +} + +static void assert_vcpu_reset(struct kvm_vcpu *vcpu) +{ + uint64_t obs_pc, obs_x0; + + obs_pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); + obs_x0 = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0])); + + TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, + "unexpected target cpu pc: %lx (expected: %lx)", + obs_pc, CPU_ON_ENTRY_ADDR); + TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, + "unexpected target context id: %lx (expected: %lx)", + obs_x0, CPU_ON_CONTEXT_ID); +} + +static void guest_test_cpu_on(uint64_t target_cpu) +{ + uint64_t target_state; + + GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); + + do { + target_state = psci_affinity_info(target_cpu, 0); + + GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || + (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); + } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + GUEST_DONE(); +} + +static void host_test_cpu_on(void) +{ + struct kvm_vcpu *source, *target; + uint64_t target_mpidr; + struct kvm_vm *vm; + struct ucall uc; + + vm = setup_vm(guest_test_cpu_on, &source, &target); + + /* + * make sure the target is already off when executing the test. 
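+	 * The guest will then issue CPU_ON and poll AFFINITY_INFO until the
+	 * target comes online.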
+ */ + vcpu_power_off(target); + + target_mpidr = vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); + vcpu_args_set(source, 1, target_mpidr & MPIDR_HWID_BITMASK); + enter_guest(source); + + if (get_ucall(source, &uc) != UCALL_DONE) + TEST_FAIL("Unhandled ucall: %lu", uc.cmd); + + assert_vcpu_reset(target); + kvm_vm_free(vm); +} + +static void guest_test_system_suspend(void) +{ + uint64_t ret; + + /* assert that SYSTEM_SUSPEND is discoverable */ + GUEST_ASSERT(!psci_features(PSCI_1_0_FN_SYSTEM_SUSPEND)); + GUEST_ASSERT(!psci_features(PSCI_1_0_FN64_SYSTEM_SUSPEND)); + + ret = psci_system_suspend(CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID); + GUEST_SYNC(ret); +} + +static void host_test_system_suspend(void) +{ + struct kvm_vcpu *source, *target; + struct kvm_run *run; + struct kvm_vm *vm; + + vm = setup_vm(guest_test_system_suspend, &source, &target); + vm_enable_cap(vm, KVM_CAP_ARM_SYSTEM_SUSPEND, 0); + + vcpu_power_off(target); + run = source->run; + + enter_guest(source); + + TEST_ASSERT_KVM_EXIT_REASON(source, KVM_EXIT_SYSTEM_EVENT); + TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND, + "Unhandled system event: %u (expected: %u)", + run->system_event.type, KVM_SYSTEM_EVENT_SUSPEND); + + kvm_vm_free(vm); +} + +static void guest_test_system_off2(void) +{ + uint64_t ret; + + /* assert that SYSTEM_OFF2 is discoverable */ + GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) & + PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); + GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) & + PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); + + /* With non-zero 'cookie' field, it should fail */ + ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 1); + GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); + + /* + * This would normally never return, so KVM sets the return value + * to PSCI_RET_INTERNAL_FAILURE. The test case *does* return, so + * that it can test both values for HIBERNATE_OFF. + */ + ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 0); + GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); + + /* + * Revision F.b of the PSCI v1.3 specification documents zero as an + * alias for HIBERNATE_OFF, since that's the value used in earlier + * revisions of the spec and some implementations in the field. + */ + ret = psci_system_off2(0, 1); + GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); + + ret = psci_system_off2(0, 0); + GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); + + GUEST_DONE(); +} + +static void host_test_system_off2(void) +{ + struct kvm_vcpu *source, *target; + struct kvm_mp_state mps; + uint64_t psci_version = 0; + int nr_shutdowns = 0; + struct kvm_run *run; + struct ucall uc; + + setup_vm(guest_test_system_off2, &source, &target); + + psci_version = vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION); + + TEST_ASSERT(psci_version >= PSCI_VERSION(1, 3), + "Unexpected PSCI version %lu.%lu", + PSCI_VERSION_MAJOR(psci_version), + PSCI_VERSION_MINOR(psci_version)); + + vcpu_power_off(target); + run = source->run; + + enter_guest(source); + while (run->exit_reason == KVM_EXIT_SYSTEM_EVENT) { + TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SHUTDOWN, + "Unhandled system event: %u (expected: %u)", + run->system_event.type, KVM_SYSTEM_EVENT_SHUTDOWN); + TEST_ASSERT(run->system_event.ndata >= 1, + "Unexpected amount of system event data: %u (expected, >= 1)", + run->system_event.ndata); + TEST_ASSERT(run->system_event.data[0] & KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2, + "PSCI_OFF2 flag not set. 
Flags %llu (expected %llu)", + run->system_event.data[0], KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); + + nr_shutdowns++; + + /* Restart the vCPU */ + mps.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_mp_state_set(source, &mps); + + enter_guest(source); + } + + TEST_ASSERT(get_ucall(source, &uc) == UCALL_DONE, "Guest did not exit cleanly"); + TEST_ASSERT(nr_shutdowns == 2, "Two shutdown events were expected, but saw %d", nr_shutdowns); +} + +int main(void) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); + + host_test_cpu_on(); + host_test_system_suspend(); + host_test_system_off2(); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c new file mode 100644 index 000000000000..bc6cf50e5135 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * set_id_regs - Test for setting ID registers from userspace. + * + * Copyright (c) 2023 Google LLC. + * + * + * Test that KVM supports setting ID registers from userspace and handles the + * feature set correctly. + */ + +#include +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include + +enum ftr_type { + FTR_EXACT, /* Use a predefined safe value */ + FTR_LOWER_SAFE, /* Smaller value is safe */ + FTR_HIGHER_SAFE, /* Bigger value is safe */ + FTR_HIGHER_OR_ZERO_SAFE, /* Bigger value is safe, but 0 is biggest */ + FTR_END, /* Mark the last ftr bits */ +}; + +#define FTR_SIGNED true /* Value should be treated as signed */ +#define FTR_UNSIGNED false /* Value should be treated as unsigned */ + +struct reg_ftr_bits { + char *name; + bool sign; + enum ftr_type type; + uint8_t shift; + uint64_t mask; + /* + * For FTR_EXACT, safe_val is used as the exact safe value. + * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. 
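+	 * The FTR_HIGHER_SAFE and FTR_HIGHER_OR_ZERO_SAFE types do not use
+	 * safe_val.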
+ */ + int64_t safe_val; +}; + +struct test_feature_reg { + uint32_t reg; + const struct reg_ftr_bits *ftr_bits; +}; + +#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL) \ + { \ + .name = #NAME, \ + .sign = SIGNED, \ + .type = TYPE, \ + .shift = SHIFT, \ + .mask = MASK, \ + .safe_val = SAFE_VAL, \ + } + +#define REG_FTR_BITS(type, reg, field, safe_val) \ + __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \ + reg##_##field##_MASK, safe_val) + +#define S_REG_FTR_BITS(type, reg, field, safe_val) \ + __REG_FTR_BITS(reg##_##field, FTR_SIGNED, type, reg##_##field##_SHIFT, \ + reg##_##field##_MASK, safe_val) + +#define REG_FTR_END \ + { \ + .type = FTR_END, \ + } + +static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DoubleLock, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, WRPs, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RNDR, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TLB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, FHM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, DP, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM4, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RDM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TME, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, ATOMIC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, CRC32, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, AES, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LS64, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, XS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, I8MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DGH, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, BF16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SPECRES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FRINTTS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LRCPC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FCMA, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, JSCVT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DPB, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, BC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, RPRES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, WFxT, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), + 
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), + REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL0, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, CSV2_frac, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, SSBS, ID_AA64PFR1_EL1_SSBS_NI), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, BT, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN4, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN64, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGENDEL0, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, SNSMEM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGEND, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ASIDBITS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, PARANGE, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, TIDCP1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, AFP, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, ETS, 0), + REG_FTR_BITS(FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1, SpecSEI, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, PAN, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, LO, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HPDS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HAFDBS, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr2_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, E0PD, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, BBM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, TTL, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, AT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, ST, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, VARange, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, IESB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, LSM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, UAO, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, CnP, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64zfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F64MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F32MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, I8MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SM4, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SHA3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BF16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BitPerm, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, AES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SVEver, 0), + REG_FTR_END, +}; + +#define TEST_REG(id, table) \ + { \ + .reg = id, \ + .ftr_bits = &((table)[0]), \ + } + +static struct test_feature_reg test_regs[] = { + TEST_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0_el1), + TEST_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0_el1), + TEST_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0_el1), + TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), + TEST_REG(SYS_ID_AA64ISAR2_EL1, 
ftr_id_aa64isar2_el1), + TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), + TEST_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1_el1), + TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), + TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), + TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), + TEST_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0_el1), +}; + +#define GUEST_REG_SYNC(id) GUEST_SYNC_ARGS(0, id, read_sysreg_s(id), 0, 0); + +static void guest_code(void) +{ + GUEST_REG_SYNC(SYS_ID_AA64DFR0_EL1); + GUEST_REG_SYNC(SYS_ID_DFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR1_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64PFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); + GUEST_REG_SYNC(SYS_CTR_EL0); + + GUEST_DONE(); +} + +/* Return a safe value to a given ftr_bits an ftr value */ +uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) +{ + uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + if (ftr_bits->sign == FTR_UNSIGNED) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = ftr_bits->safe_val; + break; + case FTR_LOWER_SAFE: + if (ftr > ftr_bits->safe_val) + ftr--; + break; + case FTR_HIGHER_SAFE: + if (ftr < ftr_max) + ftr++; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == ftr_max) + ftr = 0; + else if (ftr != 0) + ftr++; + break; + default: + break; + } + } else if (ftr != ftr_max) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = ftr_bits->safe_val; + break; + case FTR_LOWER_SAFE: + if (ftr > ftr_bits->safe_val) + ftr--; + break; + case FTR_HIGHER_SAFE: + if (ftr < ftr_max - 1) + ftr++; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr != 0 && ftr != ftr_max - 1) + ftr++; + break; + default: + break; + } + } + + return ftr; +} + +/* Return an invalid value to a given ftr_bits an ftr value */ +uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) +{ + uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + if (ftr_bits->sign == FTR_UNSIGNED) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); + break; + case FTR_LOWER_SAFE: + ftr++; + break; + case FTR_HIGHER_SAFE: + ftr--; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == 0) + ftr = ftr_max; + else + ftr--; + break; + default: + break; + } + } else if (ftr != ftr_max) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); + break; + case FTR_LOWER_SAFE: + ftr++; + break; + case FTR_HIGHER_SAFE: + ftr--; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == 0) + ftr = ftr_max - 1; + else + ftr--; + break; + default: + break; + } + } else { + ftr = 0; + } + + return ftr; +} + +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) +{ + uint8_t shift = ftr_bits->shift; + uint64_t mask = ftr_bits->mask; + uint64_t val, new_val, ftr; + + val = vcpu_get_reg(vcpu, reg); + ftr = (val & mask) >> shift; + + ftr = get_safe_value(ftr_bits, ftr); + + ftr <<= shift; + val &= ~mask; + val |= ftr; + + vcpu_set_reg(vcpu, reg, val); + new_val = vcpu_get_reg(vcpu, reg); + TEST_ASSERT_EQ(new_val, val); + + return new_val; +} + +static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) +{ + uint8_t shift = ftr_bits->shift; + uint64_t mask = ftr_bits->mask; + uint64_t val, 
old_val, ftr; + int r; + + val = vcpu_get_reg(vcpu, reg); + ftr = (val & mask) >> shift; + + ftr = get_invalid_value(ftr_bits, ftr); + + old_val = val; + ftr <<= shift; + val &= ~mask; + val |= ftr; + + r = __vcpu_set_reg(vcpu, reg, val); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + val = vcpu_get_reg(vcpu, reg); + TEST_ASSERT_EQ(val, old_val); +} + +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) +{ + uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + struct reg_mask_range range = { + .addr = (__u64)masks, + }; + int ret; + + /* KVM should return error when reserved field is not zero */ + range.reserved[0] = 1; + ret = __vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + TEST_ASSERT(ret, "KVM doesn't check invalid parameters."); + + /* Get writable masks for feature ID registers */ + memset(range.reserved, 0, sizeof(range.reserved)); + vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) { + const struct reg_ftr_bits *ftr_bits = test_regs[i].ftr_bits; + uint32_t reg_id = test_regs[i].reg; + uint64_t reg = KVM_ARM64_SYS_REG(reg_id); + int idx; + + /* Get the index to masks array for the idreg */ + idx = encoding_to_range_idx(reg_id); + + for (int j = 0; ftr_bits[j].type != FTR_END; j++) { + /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ + if (aarch64_only && sys_reg_CRm(reg_id) < 4) { + ksft_test_result_skip("%s on AARCH64 only system\n", + ftr_bits[j].name); + continue; + } + + /* Make sure the feature field is writable */ + TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); + + test_reg_set_fail(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); + + ksft_test_result_pass("%s\n", ftr_bits[j].name); + } + } +} + +#define MPAM_IDREG_TEST 6 +static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu) +{ + uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + struct reg_mask_range range = { + .addr = (__u64)masks, + }; + uint64_t val; + int idx, err; + + /* + * If ID_AA64PFR0.MPAM is _not_ officially modifiable and is zero, + * check that if it can be set to 1, (i.e. it is supported by the + * hardware), that it can't be set to other values. + */ + + /* Get writable masks for feature ID registers */ + memset(range.reserved, 0, sizeof(range.reserved)); + vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + + /* Writeable? Nothing to test! */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR0_EL1); + if ((masks[idx] & ID_AA64PFR0_EL1_MPAM_MASK) == ID_AA64PFR0_EL1_MPAM_MASK) { + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); + + /* Try to set MPAM=0. This should always be possible. 
*/ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=0 worked\n"); + + /* Try to set MPAM=1 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=1 was writable\n"); + + /* Try to set MPAM=2 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM value should not be ignored\n"); + + /* And again for ID_AA64PFR1_EL1.MPAM_frac */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR1_EL1); + if ((masks[idx] & ID_AA64PFR1_EL1_MPAM_frac_MASK) == ID_AA64PFR1_EL1_MPAM_frac_MASK) { + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); + + /* Try to set MPAM_frac=0. This should always be possible. */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM_frac=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=0 worked\n"); + + /* Try to set MPAM_frac=1 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=1 was writable\n"); + + /* Try to set MPAM_frac=2 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac value should not be ignored\n"); +} + +static void test_guest_reg_read(struct kvm_vcpu *vcpu) +{ + bool done = false; + struct ucall uc; + + while (!done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_SYNC: + /* Make sure the written values are seen by guest */ + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +/* Politely lifted from arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + 
+ clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. + */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_ctr(struct kvm_vcpu *vcpu) +{ + u64 ctr; + + ctr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0)); + ctr &= ~CTR_EL0_DIC_MASK; + if (ctr & CTR_EL0_IminLine_MASK) + ctr--; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), ctr); + test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + test_ctr(vcpu); + + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + observed = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding)); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. + */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0); + + ksft_test_result_pass("%s\n", __func__); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool aarch64_only; + uint64_t val, el0; + int test_cnt; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Check for AARCH64 only system */ + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); + el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + aarch64_only = (el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + + ksft_print_header(); + + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + + MPAM_IDREG_TEST; + + ksft_set_plan(test_cnt); + + test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); + test_user_set_mpam_reg(vcpu); + + test_guest_reg_read(vcpu); + + test_reset_preserves_id_regs(vcpu); + + kvm_vm_free(vm); + + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/arm64/smccc_filter.c b/tools/testing/selftests/kvm/arm64/smccc_filter.c new file mode 100644 index 000000000000..2d189f3da228 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/smccc_filter.c @@ -0,0 +1,268 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* + * smccc_filter - Tests for the SMCCC filter UAPI. + * + * Copyright (c) 2023 Google LLC + * + * This test includes: + * - Tests that the UAPI constraints are upheld by KVM. For example, userspace + * is prevented from filtering the architecture range of SMCCC calls. + * - Test that the filter actions (DENIED, FWD_TO_USER) work as intended. + */ + +#include +#include +#include + +#include "processor.h" +#include "test_util.h" + +enum smccc_conduit { + HVC_INSN, + SMC_INSN, +}; + +#define for_each_conduit(conduit) \ + for (conduit = HVC_INSN; conduit <= SMC_INSN; conduit++) + +static void guest_main(uint32_t func_id, enum smccc_conduit conduit) +{ + struct arm_smccc_res res; + + if (conduit == SMC_INSN) + smccc_smc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); + else + smccc_hvc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); + + GUEST_SYNC(res.a0); +} + +static int __set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, + enum kvm_smccc_filter_action action) +{ + struct kvm_smccc_filter filter = { + .base = start, + .nr_functions = nr_functions, + .action = action, + }; + + return __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, + KVM_ARM_VM_SMCCC_FILTER, &filter); +} + +static void set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, + enum kvm_smccc_filter_action action) +{ + int ret = __set_smccc_filter(vm, start, nr_functions, action); + + TEST_ASSERT(!ret, "failed to configure SMCCC filter: %d", ret); +} + +static struct kvm_vm *setup_vm(struct kvm_vcpu **vcpu) +{ + struct kvm_vcpu_init init; + struct kvm_vm *vm; + + vm = vm_create(1); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + + /* + * Enable in-kernel emulation of PSCI to ensure that calls are denied + * due to the SMCCC filter, not because of KVM. 
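+	 * With PSCI 0.2 emulation enabled, the PSCI function IDs used below are
+	 * normally handled in-kernel, so SMCCC_RET_NOT_SUPPORTED can only come
+	 * from the filter.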
+ */ + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + *vcpu = aarch64_vcpu_add(vm, 0, &init, guest_main); + return vm; +} + +static void test_pad_must_be_zero(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + struct kvm_smccc_filter filter = { + .base = PSCI_0_2_FN_PSCI_VERSION, + .nr_functions = 1, + .action = KVM_SMCCC_FILTER_DENY, + .pad = { -1 }, + }; + int r; + + r = __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, + KVM_ARM_VM_SMCCC_FILTER, &filter); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Setting filter with nonzero padding should return EINVAL"); +} + +/* Ensure that userspace cannot filter the Arm Architecture SMCCC range */ +static void test_filter_reserved_range(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + uint32_t smc64_fn; + int r; + + r = __set_smccc_filter(vm, ARM_SMCCC_ARCH_WORKAROUND_1, + 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter reserved range should return EEXIST"); + + smc64_fn = ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, + 0, 0); + + r = __set_smccc_filter(vm, smc64_fn, 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter reserved range should return EEXIST"); + + kvm_vm_free(vm); +} + +static void test_invalid_nr_functions(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 0, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to filter 0 functions should return EINVAL"); + + kvm_vm_free(vm); +} + +static void test_overflow_nr_functions(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, ~0, ~0, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to overflow filter range should return EINVAL"); + + kvm_vm_free(vm); +} + +static void test_reserved_action(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, -1); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to use reserved filter action should return EINVAL"); + + kvm_vm_free(vm); +} + + +/* Test that overlapping configurations of the SMCCC filter are rejected */ +static void test_filter_overlap(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); + + r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter already configured range should return EEXIST"); + + kvm_vm_free(vm); +} + +static void expect_call_denied(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) != UCALL_SYNC) + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + + TEST_ASSERT(uc.args[1] == SMCCC_RET_NOT_SUPPORTED, + "Unexpected SMCCC return code: %lu", uc.args[1]); +} + +/* Denied SMCCC calls have a return code of SMCCC_RET_NOT_SUPPORTED */ +static void test_filter_denied(void) +{ + enum smccc_conduit conduit; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + for_each_conduit(conduit) { + vm = setup_vm(&vcpu); + + set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_DENY); + vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); + + vcpu_run(vcpu); + expect_call_denied(vcpu); + + kvm_vm_free(vm); + } +} + +static void expect_call_fwd_to_user(struct kvm_vcpu *vcpu, uint32_t func_id, + enum 
smccc_conduit conduit) +{ + struct kvm_run *run = vcpu->run; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERCALL, + "Unexpected exit reason: %u", run->exit_reason); + TEST_ASSERT(run->hypercall.nr == func_id, + "Unexpected SMCCC function: %llu", run->hypercall.nr); + + if (conduit == SMC_INSN) + TEST_ASSERT(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC, + "KVM_HYPERCALL_EXIT_SMC is not set"); + else + TEST_ASSERT(!(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC), + "KVM_HYPERCALL_EXIT_SMC is set"); +} + +/* SMCCC calls forwarded to userspace cause KVM_EXIT_HYPERCALL exits */ +static void test_filter_fwd_to_user(void) +{ + enum smccc_conduit conduit; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + for_each_conduit(conduit) { + vm = setup_vm(&vcpu); + + set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_FWD_TO_USER); + vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); + + vcpu_run(vcpu); + expect_call_fwd_to_user(vcpu, PSCI_0_2_FN_PSCI_VERSION, conduit); + + kvm_vm_free(vm); + } +} + +static bool kvm_supports_smccc_filter(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + int r; + + r = __kvm_has_device_attr(vm->fd, KVM_ARM_VM_SMCCC_CTRL, KVM_ARM_VM_SMCCC_FILTER); + + kvm_vm_free(vm); + return !r; +} + +int main(void) +{ + TEST_REQUIRE(kvm_supports_smccc_filter()); + + test_pad_must_be_zero(); + test_invalid_nr_functions(); + test_overflow_nr_functions(); + test_reserved_action(); + test_filter_reserved_range(); + test_filter_overlap(); + test_filter_denied(); + test_filter_fwd_to_user(); +} diff --git a/tools/testing/selftests/kvm/arm64/vcpu_width_config.c b/tools/testing/selftests/kvm/arm64/vcpu_width_config.c new file mode 100644 index 000000000000..80b74c6f152b --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/vcpu_width_config.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vcpu_width_config - Test KVM_ARM_VCPU_INIT() with KVM_ARM_VCPU_EL1_32BIT. + * + * Copyright (c) 2022 Google LLC. + * + * This is a test that ensures that non-mixed-width vCPUs (all 64bit vCPUs + * or all 32bit vcPUs) can be configured and mixed-width vCPUs cannot be + * configured. + */ + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + + +/* + * Add a vCPU, run KVM_ARM_VCPU_INIT with @init0, and then + * add another vCPU, and run KVM_ARM_VCPU_INIT with @init1. + */ +static int add_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) +{ + struct kvm_vcpu *vcpu0, *vcpu1; + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + vcpu0 = __vm_vcpu_add(vm, 0); + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); + if (ret) + goto free_exit; + + vcpu1 = __vm_vcpu_add(vm, 1); + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init0, + * and run KVM_ARM_VCPU_INIT for another vCPU with @init1. + */ +static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) +{ + struct kvm_vcpu *vcpu0, *vcpu1; + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + vcpu0 = __vm_vcpu_add(vm, 0); + vcpu1 = __vm_vcpu_add(vm, 1); + + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); + if (ret) + goto free_exit; + + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Tests that two 64bit vCPUs can be configured, two 32bit vCPUs can be + * configured, and two mixed-width vCPUs cannot be configured. 
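+ * (KVM_ARM_VCPU_INIT is expected to fail for the second, mismatched vCPU.)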
+ * Each of those three cases, configure vCPUs in two different orders. + * The one is running KVM_CREATE_VCPU for 2 vCPUs, and then running + * KVM_ARM_VCPU_INIT for them. + * The other is running KVM_CREATE_VCPU and KVM_ARM_VCPU_INIT for a vCPU, + * and then run those commands for another vCPU. + */ +int main(void) +{ + struct kvm_vcpu_init init0, init1; + struct kvm_vm *vm; + int ret; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)); + + /* Get the preferred target type and copy that to init1 for later use */ + vm = vm_create_barebones(); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init0); + kvm_vm_free(vm); + init1 = init0; + + /* Test with 64bit vCPUs */ + ret = add_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + + /* Test with 32bit vCPUs */ + init0.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + + /* Test with mixed-width vCPUs */ + init0.features[0] = 0; + init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init1); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init1); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vgic_init.c b/tools/testing/selftests/kvm/arm64/vgic_init.c new file mode 100644 index 000000000000..b3b5fb0ff0a9 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/vgic_init.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic init sequence tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vgic.h" + +#define NR_VCPUS 4 + +#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset) + +#define GICR_TYPER 0x8 + +#define VGIC_DEV_IS_V2(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V2) +#define VGIC_DEV_IS_V3(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V3) + +struct vm_gic { + struct kvm_vm *vm; + int gic_fd; + uint32_t gic_dev_type; +}; + +static uint64_t max_phys_size; + +/* + * Helpers to access a redistributor register and verify the ioctl() failed or + * succeeded as expected, and provided the correct value on success. + */ +static void v3_redist_reg_get_errno(int gicv3_fd, int vcpu, int offset, + int want, const char *msg) +{ + uint32_t ignored_val; + int ret = __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &ignored_val); + + TEST_ASSERT(ret && errno == want, "%s; want errno = %d", msg, want); +} + +static void v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t want, + const char *msg) +{ + uint32_t val; + + kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &val); + TEST_ASSERT(val == want, "%s; want '0x%x', got '0x%x'", msg, want, val); +} + +/* dummy guest code */ +static void guest_code(void) +{ + GUEST_SYNC(0); + GUEST_SYNC(1); + GUEST_SYNC(2); + GUEST_DONE(); +} + +/* we don't want to assert on run execution, hence that helper */ +static int run_vcpu(struct kvm_vcpu *vcpu) +{ + return __vcpu_run(vcpu) ? 
-errno : 0; +} + +static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, + uint32_t nr_vcpus, + struct kvm_vcpu *vcpus[]) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + +static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_barebones(); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + + +static void vm_gic_destroy(struct vm_gic *v) +{ + close(v->gic_fd); + kvm_vm_free(v->vm); +} + +struct vgic_region_attr { + uint64_t attr; + uint64_t size; + uint64_t alignment; +}; + +struct vgic_region_attr gic_v3_dist_region = { + .attr = KVM_VGIC_V3_ADDR_TYPE_DIST, + .size = 0x10000, + .alignment = 0x10000, +}; + +struct vgic_region_attr gic_v3_redist_region = { + .attr = KVM_VGIC_V3_ADDR_TYPE_REDIST, + .size = NR_VCPUS * 0x20000, + .alignment = 0x10000, +}; + +struct vgic_region_attr gic_v2_dist_region = { + .attr = KVM_VGIC_V2_ADDR_TYPE_DIST, + .size = 0x1000, + .alignment = 0x1000, +}; + +struct vgic_region_attr gic_v2_cpu_region = { + .attr = KVM_VGIC_V2_ADDR_TYPE_CPU, + .size = 0x2000, + .alignment = 0x1000, +}; + +/** + * Helper routine that performs KVM device tests in general. Eventually the + * ARM_VGIC (GICv2 or GICv3) device gets created with an overlapping + * DIST/REDIST (or DIST/CPUIF for GICv2). Assumption is 4 vcpus are going to be + * used hence the overlap. In the case of GICv3, A RDIST region is set at @0x0 + * and a DIST region is set @0x70000. The GICv2 case sets a CPUIF @0x0 and a + * DIST region @0x1000. + */ +static void subtest_dist_rdist(struct vm_gic *v) +{ + int ret; + uint64_t addr; + struct vgic_region_attr rdist; /* CPU interface in GICv2*/ + struct vgic_region_attr dist; + + rdist = VGIC_DEV_IS_V3(v->gic_dev_type) ? gic_v3_redist_region + : gic_v2_cpu_region; + dist = VGIC_DEV_IS_V3(v->gic_dev_type) ? gic_v3_dist_region + : gic_v2_dist_region; + + /* Check existing group/attributes */ + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, dist.attr); + + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, rdist.attr); + + /* check non existing attribute */ + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, -1); + TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); + + /* misaligned DIST and REDIST address settings */ + addr = dist.alignment / 0x10; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); + TEST_ASSERT(ret && errno == EINVAL, "GIC dist base not aligned"); + + addr = rdist.alignment / 0x10; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == EINVAL, "GIC redist/cpu base not aligned"); + + /* out of range address */ + addr = max_phys_size; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); + + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); + + /* Space for half a rdist (a rdist is: 2 * rdist.alignment). 
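+	 * Only one alignment unit of IPA space is left, so setting the base
+	 * this close to the limit must fail with E2BIG.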
*/ + addr = max_phys_size - dist.alignment; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "half of the redist is beyond IPA limit"); + + /* set REDIST base address @0x0*/ + addr = 0x00000; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + + /* Attempt to create a second legacy redistributor region */ + addr = 0xE0000; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == EEXIST, "GIC redist base set again"); + + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + if (!ret) { + /* Attempt to mix legacy and new redistributor regions */ + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "attempt to mix GICv3 REDIST and REDIST_REGION"); + } + + /* + * Set overlapping DIST / REDIST, cannot be detected here. Will be detected + * on first vcpu run instead. + */ + addr = rdist.size - rdist.alignment; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); +} + +/* Test the new REDIST region API */ +static void subtest_v3_redist_regions(struct vm_gic *v) +{ + uint64_t addr, expected_addr; + int ret; + + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + TEST_ASSERT(!ret, "Multiple redist regions advertised"); + + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "attempt to register the first rdist region with index != 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "register an rdist region overlapping with another one"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register redist region with 
index not +1"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(1, max_phys_size, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register redist region with base address beyond IPA range"); + + /* The last redist is above the pa range. */ + addr = REDIST_REGION_ATTR_ADDR(2, max_phys_size - 0x30000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register redist region with top address beyond IPA range"); + + addr = 0x260000; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); + + /* + * Now there are 2 redist regions: + * region 0 @ 0x200000 2 redists + * region 1 @ 0x240000 1 redist + * Attempt to read their characteristics + */ + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); + expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); + expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); + + addr = 0x260000; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); +} + +/* + * VGIC KVM device is created and initialized before the secondary CPUs + * get created + */ +static void test_vgic_then_vcpus(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret, i; + + v = vm_gic_create_with_vcpus(gic_dev_type, 1, vcpus); + + subtest_dist_rdist(&v); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +/* All the VCPUs are created before the VGIC KVM device gets initialized */ +static void test_vcpus_then_vgic(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret; + + v = vm_gic_create_with_vcpus(gic_dev_type, NR_VCPUS, vcpus); + + subtest_dist_rdist(&v); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +#define KVM_VGIC_V2_ATTR(offset, cpu) \ + (FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \ + FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu)) 
+ +#define GIC_CPU_CTRL 0x00 + +static void test_v2_uaccess_cpuif_no_vcpus(void) +{ + struct vm_gic v; + u64 val = 0; + int ret; + + v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2); + subtest_dist_rdist(&v); + + ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0)); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + + vm_gic_destroy(&v); +} + +static void test_v3_new_redist_regions(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + void *dummy = NULL; + struct vm_gic v; + uint64_t addr; + int ret; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); + vm_gic_destroy(&v); + + /* step2 */ + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); + + vm_gic_destroy(&v); + + /* step 3 */ + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy); + TEST_ASSERT(ret && errno == EFAULT, + "register a third region allowing to cover the 4 vcpus"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(!ret, "vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_v3_typer_accesses(void) +{ + struct vm_gic v; + uint64_t addr; + int ret, i; + + v.vm = vm_create(NR_VCPUS); + (void)vm_vcpu_add(v.vm, 0, guest_code); + + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + + (void)vm_vcpu_add(v.vm, 3, guest_code); + + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, + "attempting to read GICR_TYPER of non created vcpu"); + + (void)vm_vcpu_add(v.vm, 1, guest_code); + + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, + "read GICR_TYPER before GIC initialized"); + + (void)vm_vcpu_add(v.vm, 2, guest_code); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + for (i = 0; i < NR_VCPUS ; i++) { + v3_redist_reg_get(v.gic_fd, i, GICR_TYPER, i * 0x100, + "read GICR_TYPER before rdist region setting"); + } + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + /* The 2 first rdists should be put there (vcpu 0 and 3) */ + 
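+	/*
+	 * In this 32-bit view of GICR_TYPER, Processor_Number sits in bits
+	 * [23:8] and bit 4 is "Last", hence 0x310: vcpu 3 is the last
+	 * redistributor of its region.
+	 */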
v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x0, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #1"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); + + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, + "no redist region attached to vcpu #1 yet, last cannot be returned"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, + "no redist region attached to vcpu #2, last cannot be returned"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, + "read typer of rdist #1, last properly returned"); + + vm_gic_destroy(&v); +} + +static struct vm_gic vm_gic_v3_create_with_vcpuids(int nr_vcpus, + uint32_t vcpuids[]) +{ + struct vm_gic v; + int i; + + v.vm = vm_create(nr_vcpus); + for (i = 0; i < nr_vcpus; i++) + vm_vcpu_add(v.vm, vcpuids[i], guest_code); + + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + + return v; +} + +/** + * Test GICR_TYPER last bit with new redist regions + * rdist regions #1 and #2 are contiguous + * rdist region #0 @0x100000 2 rdist capacity + * rdists: 0, 3 (Last) + * rdist region #1 @0x240000 2 rdist capacity + * rdists: 5, 4 (Last) + * rdist region #2 @0x200000 2 rdist capacity + * rdists: 1, 2 + */ +static void test_v3_last_bit_redist_regions(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + struct vm_gic v; + uint64_t addr; + + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #5"); + v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, 0x410, "read typer of rdist #4"); + + vm_gic_destroy(&v); +} + +/* Test last bit with legacy region */ +static void test_v3_last_bit_single_rdist(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + struct vm_gic v; + uint64_t addr; + + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + addr = 0x10000; + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x300, "read typer of rdist 
#1"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, "read typer of rdist #3"); + + vm_gic_destroy(&v); +} + +/* Uses the legacy REDIST region API. */ +static void test_v3_redist_ipa_range_check_at_vcpu_run(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret, i; + uint64_t addr; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, 1, vcpus); + + /* Set space for 3 redists, we have 1 vcpu, so this succeeds. */ + addr = max_phys_size - (3 * 2 * 0x10000); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + + addr = 0x00000; + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + /* Attempt to run a vcpu without enough redist space. */ + ret = run_vcpu(vcpus[2]); + TEST_ASSERT(ret && errno == EINVAL, + "redist base+size above PA range detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_v3_its_region(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + uint64_t addr; + int its_fd, ret; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + + addr = 0x401000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "ITS region with misaligned address"); + + addr = max_phys_size; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register ITS region with base address beyond IPA range"); + + addr = max_phys_size - 0x10000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "Half of ITS region is beyond IPA range"); + + /* This one succeeds setting the ITS base */ + addr = 0x400000; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + + addr = 0x300000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == EEXIST, "ITS base set again"); + + close(its_fd); + vm_gic_destroy(&v); +} + +/* + * Returns 0 if it's possible to create GIC device of a given type (V2 or V3). + */ +int test_kvm_device(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + uint32_t other; + int ret; + + v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); + + /* try to create a non existing KVM device */ + ret = __kvm_test_create_device(v.vm, 0); + TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); + + /* trial mode */ + ret = __kvm_test_create_device(v.vm, gic_dev_type); + if (ret) + return ret; + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + ret = __kvm_create_device(v.vm, gic_dev_type); + TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice"); + + /* try to create the other gic_dev_type */ + other = VGIC_DEV_IS_V2(gic_dev_type) ? 
KVM_DEV_TYPE_ARM_VGIC_V3 + : KVM_DEV_TYPE_ARM_VGIC_V2; + + if (!__kvm_test_create_device(v.vm, other)) { + ret = __kvm_create_device(v.vm, other); + TEST_ASSERT(ret < 0 && (errno == EINVAL || errno == EEXIST), + "create GIC device while other version exists"); + } + + vm_gic_destroy(&v); + + return 0; +} + +void run_tests(uint32_t gic_dev_type) +{ + test_vcpus_then_vgic(gic_dev_type); + test_vgic_then_vcpus(gic_dev_type); + + if (VGIC_DEV_IS_V2(gic_dev_type)) + test_v2_uaccess_cpuif_no_vcpus(); + + if (VGIC_DEV_IS_V3(gic_dev_type)) { + test_v3_new_redist_regions(); + test_v3_typer_accesses(); + test_v3_last_bit_redist_regions(); + test_v3_last_bit_single_rdist(); + test_v3_redist_ipa_range_check_at_vcpu_run(); + test_v3_its_region(); + } +} + +int main(int ac, char **av) +{ + int ret; + int pa_bits; + int cnt_impl = 0; + + pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; + max_phys_size = 1ULL << pa_bits; + + ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (!ret) { + pr_info("Running GIC_v3 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V3); + cnt_impl++; + } + + ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (!ret) { + pr_info("Running GIC_v2 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); + cnt_impl++; + } + + if (!cnt_impl) { + print_skip("No GICv2 nor GICv3 support"); + exit(KSFT_SKIP); + } + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c new file mode 100644 index 000000000000..f4ac28d53747 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -0,0 +1,847 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_irq.c - Test userspace injection of IRQs + * + * This test validates the injection of IRQs from userspace using various + * methods (e.g., KVM_IRQ_LINE) and modes (e.g., EOI). The guest "asks" the + * host to inject a specific intid via a GUEST_SYNC call, and then checks that + * it received it. + */ +#include +#include +#include +#include + +#include "processor.h" +#include "test_util.h" +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "vgic.h" + +/* + * Stores the user specified args; it's passed to the guest and to every test + * function. + */ +struct test_args { + uint32_t nr_irqs; /* number of KVM supported IRQs. */ + bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */ + bool level_sensitive; /* 1 is level, 0 is edge */ + int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ + bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ +}; + +/* + * KVM implements 32 priority levels: + * 0x00 (highest priority) - 0xF8 (lowest priority), in steps of 8 + * + * Note that these macros will still be correct in the case that KVM implements + * more priority levels. Also note that 32 is the minimum for GICv3 and GICv2. + */ +#define KVM_NUM_PRIOS 32 +#define KVM_PRIO_SHIFT 3 /* steps of 8 = 1 << 3 */ +#define KVM_PRIO_STEPS (1 << KVM_PRIO_SHIFT) /* 8 */ +#define LOWEST_PRIO (KVM_NUM_PRIOS - 1) +#define CPU_PRIO_MASK (LOWEST_PRIO << KVM_PRIO_SHIFT) /* 0xf8 */ +#define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) +#define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ + +/* + * The kvm_inject_* utilities are used by the guest to ask the host to inject + * interrupts (e.g., using the KVM_IRQ_LINE ioctl). 
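+ * Each request travels as a GUEST_SYNC ucall carrying a pointer to a
+ * struct kvm_inject_args that the host side unpacks.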
+ */ + +typedef enum { + KVM_INJECT_EDGE_IRQ_LINE = 1, + KVM_SET_IRQ_LINE, + KVM_SET_IRQ_LINE_HIGH, + KVM_SET_LEVEL_INFO_HIGH, + KVM_INJECT_IRQFD, + KVM_WRITE_ISPENDR, + KVM_WRITE_ISACTIVER, +} kvm_inject_cmd; + +struct kvm_inject_args { + kvm_inject_cmd cmd; + uint32_t first_intid; + uint32_t num; + int level; + bool expect_failure; +}; + +/* Used on the guest side to perform the hypercall. */ +static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, + uint32_t num, int level, bool expect_failure); + +/* Used on the host side to get the hypercall info. */ +static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc, + struct kvm_inject_args *args); + +#define _KVM_INJECT_MULTI(cmd, intid, num, expect_failure) \ + kvm_inject_call(cmd, intid, num, -1 /* not used */, expect_failure) + +#define KVM_INJECT_MULTI(cmd, intid, num) \ + _KVM_INJECT_MULTI(cmd, intid, num, false) + +#define _KVM_INJECT(cmd, intid, expect_failure) \ + _KVM_INJECT_MULTI(cmd, intid, 1, expect_failure) + +#define KVM_INJECT(cmd, intid) \ + _KVM_INJECT_MULTI(cmd, intid, 1, false) + +#define KVM_ACTIVATE(cmd, intid) \ + kvm_inject_call(cmd, intid, 1, 1, false); + +struct kvm_inject_desc { + kvm_inject_cmd cmd; + /* can inject PPIs, PPIs, and/or SPIs. */ + bool sgi, ppi, spi; +}; + +static struct kvm_inject_desc inject_edge_fns[] = { + /* sgi ppi spi */ + { KVM_INJECT_EDGE_IRQ_LINE, false, false, true }, + { KVM_INJECT_IRQFD, false, false, true }, + { KVM_WRITE_ISPENDR, true, false, true }, + { 0, }, +}; + +static struct kvm_inject_desc inject_level_fns[] = { + /* sgi ppi spi */ + { KVM_SET_IRQ_LINE_HIGH, false, true, true }, + { KVM_SET_LEVEL_INFO_HIGH, false, true, true }, + { KVM_INJECT_IRQFD, false, false, true }, + { KVM_WRITE_ISPENDR, false, true, true }, + { 0, }, +}; + +static struct kvm_inject_desc set_active_fns[] = { + /* sgi ppi spi */ + { KVM_WRITE_ISACTIVER, true, true, true }, + { 0, }, +}; + +#define for_each_inject_fn(t, f) \ + for ((f) = (t); (f)->cmd; (f)++) + +#define for_each_supported_inject_fn(args, t, f) \ + for_each_inject_fn(t, f) \ + if ((args)->kvm_supports_irqfd || (f)->cmd != KVM_INJECT_IRQFD) + +#define for_each_supported_activate_fn(args, t, f) \ + for_each_supported_inject_fn((args), (t), (f)) + +/* Shared between the guest main thread and the IRQ handlers. 
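+ * Only one vCPU runs the test, so no locking is needed; volatile keeps the
+ * compiler from caching the counters across the wfi/handler round trips.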
*/ +volatile uint64_t irq_handled; +volatile uint32_t irqnr_received[MAX_SPI + 1]; + +static void reset_stats(void) +{ + int i; + + irq_handled = 0; + for (i = 0; i <= MAX_SPI; i++) + irqnr_received[i] = 0; +} + +static uint64_t gic_read_ap1r0(void) +{ + uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); + + dsb(sy); + return reg; +} + +static void gic_write_ap1r0(uint64_t val) +{ + write_sysreg_s(val, SYS_ICC_AP1R0_EL1); + isb(); +} + +static void guest_set_irq_line(uint32_t intid, uint32_t level); + +static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive) +{ + uint32_t intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(gic_irq_get_active(intid)); + + if (!level_sensitive) + GUEST_ASSERT(!gic_irq_get_pending(intid)); + + if (level_sensitive) + guest_set_irq_line(intid, 0); + + GUEST_ASSERT(intid < MAX_SPI); + irqnr_received[intid] += 1; + irq_handled += 1; + + gic_set_eoi(intid); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + if (eoi_split) + gic_set_dir(intid); + + GUEST_ASSERT(!gic_irq_get_active(intid)); + GUEST_ASSERT(!gic_irq_get_pending(intid)); +} + +static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, + uint32_t num, int level, bool expect_failure) +{ + struct kvm_inject_args args = { + .cmd = cmd, + .first_intid = first_intid, + .num = num, + .level = level, + .expect_failure = expect_failure, + }; + GUEST_SYNC(&args); +} + +#define GUEST_ASSERT_IAR_EMPTY() \ +do { \ + uint32_t _intid; \ + _intid = gic_get_and_ack_irq(); \ + GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ +} while (0) + +#define CAT_HELPER(a, b) a ## b +#define CAT(a, b) CAT_HELPER(a, b) +#define PREFIX guest_irq_handler_ +#define GUEST_IRQ_HANDLER_NAME(split, lev) CAT(PREFIX, CAT(split, lev)) +#define GENERATE_GUEST_IRQ_HANDLER(split, lev) \ +static void CAT(PREFIX, CAT(split, lev))(struct ex_regs *regs) \ +{ \ + guest_irq_generic_handler(split, lev); \ +} + +GENERATE_GUEST_IRQ_HANDLER(0, 0); +GENERATE_GUEST_IRQ_HANDLER(0, 1); +GENERATE_GUEST_IRQ_HANDLER(1, 0); +GENERATE_GUEST_IRQ_HANDLER(1, 1); + +static void (*guest_irq_handlers[2][2])(struct ex_regs *) = { + {GUEST_IRQ_HANDLER_NAME(0, 0), GUEST_IRQ_HANDLER_NAME(0, 1),}, + {GUEST_IRQ_HANDLER_NAME(1, 0), GUEST_IRQ_HANDLER_NAME(1, 1),}, +}; + +static void reset_priorities(struct test_args *args) +{ + int i; + + for (i = 0; i < args->nr_irqs; i++) + gic_set_priority(i, IRQ_DEFAULT_PRIO_REG); +} + +static void guest_set_irq_line(uint32_t intid, uint32_t level) +{ + kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level, false); +} + +static void test_inject_fail(struct test_args *args, + uint32_t intid, kvm_inject_cmd cmd) +{ + reset_stats(); + + _KVM_INJECT(cmd, intid, true); + /* no IRQ to handle on entry */ + + GUEST_ASSERT_EQ(irq_handled, 0); + GUEST_ASSERT_IAR_EMPTY(); +} + +static void guest_inject(struct test_args *args, + uint32_t first_intid, uint32_t num, + kvm_inject_cmd cmd) +{ + uint32_t i; + + reset_stats(); + + /* Cycle over all priorities to make things more interesting. 
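+	 * Every assigned priority stays numerically below the 0xf8 mask
+	 * programmed in guest_code(), so all of the injected IRQs can still be
+	 * delivered.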
*/ + for (i = first_intid; i < num + first_intid; i++) + gic_set_priority(i, (i % (KVM_NUM_PRIOS - 1)) << 3); + + asm volatile("msr daifset, #2" : : : "memory"); + KVM_INJECT_MULTI(cmd, first_intid, num); + + while (irq_handled < num) { + wfi(); + local_irq_enable(); + isb(); /* handle IRQ */ + local_irq_disable(); + } + local_irq_enable(); + + GUEST_ASSERT_EQ(irq_handled, num); + for (i = first_intid; i < num + first_intid; i++) + GUEST_ASSERT_EQ(irqnr_received[i], 1); + GUEST_ASSERT_IAR_EMPTY(); + + reset_priorities(args); +} + +/* + * Restore the active state of multiple concurrent IRQs (given by + * concurrent_irqs). This does what a live-migration would do on the + * destination side assuming there are some active IRQs that were not + * deactivated yet. + */ +static void guest_restore_active(struct test_args *args, + uint32_t first_intid, uint32_t num, + kvm_inject_cmd cmd) +{ + uint32_t prio, intid, ap1r; + int i; + + /* + * Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs + * in descending order, so intid+1 can preempt intid. + */ + for (i = 0, prio = (num - 1) * 8; i < num; i++, prio -= 8) { + GUEST_ASSERT(prio >= 0); + intid = i + first_intid; + gic_set_priority(intid, prio); + } + + /* + * In a real migration, KVM would restore all GIC state before running + * guest code. + */ + for (i = 0; i < num; i++) { + intid = i + first_intid; + KVM_ACTIVATE(cmd, intid); + ap1r = gic_read_ap1r0(); + ap1r |= 1U << i; + gic_write_ap1r0(ap1r); + } + + /* This is where the "migration" would occur. */ + + /* finish handling the IRQs starting with the highest priority one. */ + for (i = 0; i < num; i++) { + intid = num - i - 1 + first_intid; + gic_set_eoi(intid); + if (args->eoi_split) + gic_set_dir(intid); + } + + for (i = 0; i < num; i++) + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + GUEST_ASSERT_IAR_EMPTY(); +} + +/* + * Polls the IAR until it's not a spurious interrupt. + * + * This function should only be used in test_inject_preemption (with IRQs + * masked). + */ +static uint32_t wait_for_and_activate_irq(void) +{ + uint32_t intid; + + do { + asm volatile("wfi" : : : "memory"); + intid = gic_get_and_ack_irq(); + } while (intid == IAR_SPURIOUS); + + return intid; +} + +/* + * Inject multiple concurrent IRQs (num IRQs starting at first_intid) and + * handle them without handling the actual exceptions. This is done by masking + * interrupts for the whole test. + */ +static void test_inject_preemption(struct test_args *args, + uint32_t first_intid, int num, + kvm_inject_cmd cmd) +{ + uint32_t intid, prio, step = KVM_PRIO_STEPS; + int i; + + /* Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs + * in descending order, so intid+1 can preempt intid. + */ + for (i = 0, prio = (num - 1) * step; i < num; i++, prio -= step) { + GUEST_ASSERT(prio >= 0); + intid = i + first_intid; + gic_set_priority(intid, prio); + } + + local_irq_disable(); + + for (i = 0; i < num; i++) { + uint32_t tmp; + intid = i + first_intid; + KVM_INJECT(cmd, intid); + /* Each successive IRQ will preempt the previous one. */ + tmp = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(tmp, intid); + if (args->level_sensitive) + guest_set_irq_line(intid, 0); + } + + /* finish handling the IRQs starting with the highest priority one. 
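+	 * EOI drops the running priority one level at a time, so the active
+	 * IRQs are unwound from the innermost (highest-priority) one outwards.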
*/ + for (i = 0; i < num; i++) { + intid = num - i - 1 + first_intid; + gic_set_eoi(intid); + if (args->eoi_split) + gic_set_dir(intid); + } + + local_irq_enable(); + + for (i = 0; i < num; i++) + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + GUEST_ASSERT_IAR_EMPTY(); + + reset_priorities(args); +} + +static void test_injection(struct test_args *args, struct kvm_inject_desc *f) +{ + uint32_t nr_irqs = args->nr_irqs; + + if (f->sgi) { + guest_inject(args, MIN_SGI, 1, f->cmd); + guest_inject(args, 0, 16, f->cmd); + } + + if (f->ppi) + guest_inject(args, MIN_PPI, 1, f->cmd); + + if (f->spi) { + guest_inject(args, MIN_SPI, 1, f->cmd); + guest_inject(args, nr_irqs - 1, 1, f->cmd); + guest_inject(args, MIN_SPI, nr_irqs - MIN_SPI, f->cmd); + } +} + +static void test_injection_failure(struct test_args *args, + struct kvm_inject_desc *f) +{ + uint32_t bad_intid[] = { args->nr_irqs, 1020, 1024, 1120, 5120, ~0U, }; + int i; + + for (i = 0; i < ARRAY_SIZE(bad_intid); i++) + test_inject_fail(args, bad_intid[i], f->cmd); +} + +static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) +{ + /* + * Test up to 4 levels of preemption. The reason is that KVM doesn't + * currently implement the ability to have more than the number-of-LRs + * number of concurrently active IRQs. The number of LRs implemented is + * IMPLEMENTATION DEFINED, however, it seems that most implement 4. + */ + if (f->sgi) + test_inject_preemption(args, MIN_SGI, 4, f->cmd); + + if (f->ppi) + test_inject_preemption(args, MIN_PPI, 4, f->cmd); + + if (f->spi) + test_inject_preemption(args, MIN_SPI, 4, f->cmd); +} + +static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) +{ + /* Test up to 4 active IRQs. Same reason as in test_preemption. */ + if (f->sgi) + guest_restore_active(args, MIN_SGI, 4, f->cmd); + + if (f->ppi) + guest_restore_active(args, MIN_PPI, 4, f->cmd); + + if (f->spi) + guest_restore_active(args, MIN_SPI, 4, f->cmd); +} + +static void guest_code(struct test_args *args) +{ + uint32_t i, nr_irqs = args->nr_irqs; + bool level_sensitive = args->level_sensitive; + struct kvm_inject_desc *f, *inject_fns; + + gic_init(GIC_V3, 1); + + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + + for (i = MIN_SPI; i < nr_irqs; i++) + gic_irq_set_config(i, !level_sensitive); + + gic_set_eoi_split(args->eoi_split); + + reset_priorities(args); + gic_set_priority_mask(CPU_PRIO_MASK); + + inject_fns = level_sensitive ? inject_level_fns + : inject_edge_fns; + + local_irq_enable(); + + /* Start the tests. */ + for_each_supported_inject_fn(args, inject_fns, f) { + test_injection(args, f); + test_preemption(args, f); + test_injection_failure(args, f); + } + + /* + * Restore the active state of IRQs. This would happen when live + * migrating IRQs in the middle of being handled. + */ + for_each_supported_activate_fn(args, set_active_fns, f) + test_restore_active(args, f); + + GUEST_DONE(); +} + +static void kvm_irq_line_check(struct kvm_vm *vm, uint32_t intid, int level, + struct test_args *test_args, bool expect_failure) +{ + int ret; + + if (!expect_failure) { + kvm_arm_irq_line(vm, intid, level); + } else { + /* The interface doesn't allow larger intid's. 
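+		 * KVM_IRQ_LINE only has KVM_ARM_IRQ_NUM_MASK bits for the
+		 * interrupt number, so larger values cannot even be encoded in
+		 * the request.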
*/ + if (intid > KVM_ARM_IRQ_NUM_MASK) + return; + + ret = _kvm_arm_irq_line(vm, intid, level); + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %i did not cause KVM_IRQ_LINE " + "error: rc: %i errno: %i", intid, ret, errno); + } +} + +void kvm_irq_set_level_info_check(int gic_fd, uint32_t intid, int level, + bool expect_failure) +{ + if (!expect_failure) { + kvm_irq_set_level_info(gic_fd, intid, level); + } else { + int ret = _kvm_irq_set_level_info(gic_fd, intid, level); + /* + * The kernel silently fails for invalid SPIs and SGIs (which + * are not level-sensitive). It only checks for intid to not + * spill over 1U << 10 (the max reserved SPI). Also, callers + * are supposed to mask the intid with 0x3ff (1023). + */ + if (intid > VGIC_MAX_RESERVED) + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %i did not cause VGIC_GRP_LEVEL_INFO " + "error: rc: %i errno: %i", intid, ret, errno); + else + TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO " + "for intid %i failed, rc: %i errno: %i", + intid, ret, errno); + } +} + +static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm, + uint32_t intid, uint32_t num, uint32_t kvm_max_routes, + bool expect_failure) +{ + struct kvm_irq_routing *routing; + int ret; + uint64_t i; + + assert(num <= kvm_max_routes && kvm_max_routes <= KVM_MAX_IRQ_ROUTES); + + routing = kvm_gsi_routing_create(); + for (i = intid; i < (uint64_t)intid + num; i++) + kvm_gsi_routing_irqchip_add(routing, i - MIN_SPI, i - MIN_SPI); + + if (!expect_failure) { + kvm_gsi_routing_write(vm, routing); + } else { + ret = _kvm_gsi_routing_write(vm, routing); + /* The kernel only checks e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS */ + if (((uint64_t)intid + num - 1 - MIN_SPI) >= KVM_IRQCHIP_NUM_PINS) + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %u did not cause KVM_SET_GSI_ROUTING " + "error: rc: %i errno: %i", intid, ret, errno); + else + TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING " + "for intid %i failed, rc: %i errno: %i", + intid, ret, errno); + } +} + +static void kvm_irq_write_ispendr_check(int gic_fd, uint32_t intid, + struct kvm_vcpu *vcpu, + bool expect_failure) +{ + /* + * Ignore this when expecting failure as invalid intids will lead to + * either trying to inject SGIs when we configured the test to be + * level_sensitive (or the reverse), or inject large intids which + * will lead to writing above the ISPENDR register space (and we + * don't want to do that either). + */ + if (!expect_failure) + kvm_irq_write_ispendr(gic_fd, intid, vcpu); +} + +static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, + uint32_t intid, uint32_t num, uint32_t kvm_max_routes, + bool expect_failure) +{ + int fd[MAX_SPI]; + uint64_t val; + int ret, f; + uint64_t i; + + /* + * There is no way to try injecting an SGI or PPI as the interface + * starts counting from the first SPI (above the private ones), so just + * exit. + */ + if (INTID_IS_SGI(intid) || INTID_IS_PPI(intid)) + return; + + kvm_set_gsi_routing_irqchip_check(vm, intid, num, + kvm_max_routes, expect_failure); + + /* + * If expect_failure, then just to inject anyway. These + * will silently fail. And in any case, the guest will check + * that no actual interrupt was injected for those cases. 
+ */ + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + fd[f] = eventfd(0, 0); + TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + struct kvm_irqfd irqfd = { + .fd = fd[f], + .gsi = i - MIN_SPI, + }; + assert(i <= (uint64_t)UINT_MAX); + vm_ioctl(vm, KVM_IRQFD, &irqfd); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + val = 1; + ret = write(fd[f], &val, sizeof(uint64_t)); + TEST_ASSERT(ret == sizeof(uint64_t), + __KVM_SYSCALL_ERROR("write()", ret)); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) + close(fd[f]); +} + +/* handles the valid case: intid=0xffffffff num=1 */ +#define for_each_intid(first, num, tmp, i) \ + for ((tmp) = (i) = (first); \ + (tmp) < (uint64_t)(first) + (uint64_t)(num); \ + (tmp)++, (i)++) + +static void run_guest_cmd(struct kvm_vcpu *vcpu, int gic_fd, + struct kvm_inject_args *inject_args, + struct test_args *test_args) +{ + kvm_inject_cmd cmd = inject_args->cmd; + uint32_t intid = inject_args->first_intid; + uint32_t num = inject_args->num; + int level = inject_args->level; + bool expect_failure = inject_args->expect_failure; + struct kvm_vm *vm = vcpu->vm; + uint64_t tmp; + uint32_t i; + + /* handles the valid case: intid=0xffffffff num=1 */ + assert(intid < UINT_MAX - num || num == 1); + + switch (cmd) { + case KVM_INJECT_EDGE_IRQ_LINE: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 1, test_args, + expect_failure); + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 0, test_args, + expect_failure); + break; + case KVM_SET_IRQ_LINE: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, level, test_args, + expect_failure); + break; + case KVM_SET_IRQ_LINE_HIGH: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 1, test_args, + expect_failure); + break; + case KVM_SET_LEVEL_INFO_HIGH: + for_each_intid(intid, num, tmp, i) + kvm_irq_set_level_info_check(gic_fd, i, 1, + expect_failure); + break; + case KVM_INJECT_IRQFD: + kvm_routing_and_irqfd_check(vm, intid, num, + test_args->kvm_max_routes, + expect_failure); + break; + case KVM_WRITE_ISPENDR: + for (i = intid; i < intid + num; i++) + kvm_irq_write_ispendr_check(gic_fd, i, vcpu, + expect_failure); + break; + case KVM_WRITE_ISACTIVER: + for (i = intid; i < intid + num; i++) + kvm_irq_write_isactiver(gic_fd, i, vcpu); + break; + default: + break; + } +} + +static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc, + struct kvm_inject_args *args) +{ + struct kvm_inject_args *kvm_args_hva; + vm_vaddr_t kvm_args_gva; + + kvm_args_gva = uc->args[1]; + kvm_args_hva = (struct kvm_inject_args *)addr_gva2hva(vm, kvm_args_gva); + memcpy(args, kvm_args_hva, sizeof(struct kvm_inject_args)); +} + +static void print_args(struct test_args *args) +{ + printf("nr-irqs=%d level-sensitive=%d eoi-split=%d\n", + args->nr_irqs, args->level_sensitive, + args->eoi_split); +} + +static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) +{ + struct ucall uc; + int gic_fd; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_inject_args inject_args; + vm_vaddr_t args_gva; + + struct test_args args = { + .nr_irqs = nr_irqs, + .level_sensitive = level_sensitive, + .eoi_split = eoi_split, + .kvm_max_routes = kvm_check_cap(KVM_CAP_IRQ_ROUTING), + .kvm_supports_irqfd = kvm_check_cap(KVM_CAP_IRQFD), + }; + + print_args(&args); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); 
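+	/* Per-vCPU vector table setup, needed for the IRQ handler installed below. */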
+ vcpu_init_descriptor_tables(vcpu); + + /* Setup the guest args page (so it gets the args). */ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpu, 1, args_gva); + + gic_fd = vgic_v3_setup(vm, 1, nr_irqs); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, + guest_irq_handlers[args.eoi_split][args.level_sensitive]); + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + kvm_inject_get_call(vm, &uc, &inject_args); + run_guest_cmd(vcpu, gic_fd, &inject_args, &args); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + close(gic_fd); + kvm_vm_free(vm); +} + +static void help(const char *name) +{ + printf( + "\n" + "usage: %s [-n num_irqs] [-e eoi_split] [-l level_sensitive]\n", name); + printf(" -n: specify number of IRQs to setup the vgic with. " + "It has to be a multiple of 32 and between 64 and 1024.\n"); + printf(" -e: if 1 then EOI is split into a write to DIR on top " + "of writing EOI.\n"); + printf(" -l: specify whether the IRQs are level-sensitive (1) or not (0)."); + puts(""); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t nr_irqs = 64; + bool default_args = true; + bool level_sensitive = false; + int opt; + bool eoi_split = false; + + while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { + switch (opt) { + case 'n': + nr_irqs = atoi_non_negative("Number of IRQs", optarg); + if (nr_irqs > 1024 || nr_irqs % 32) + help(argv[0]); + break; + case 'e': + eoi_split = (bool)atoi_paranoid(optarg); + default_args = false; + break; + case 'l': + level_sensitive = (bool)atoi_paranoid(optarg); + default_args = false; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + /* + * If the user just specified nr_irqs and/or gic_version, then run all + * combinations. 
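+	 * All four eoi_split/level_sensitive pairs are exercised in that case.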
+ */ + if (default_args) { + test_vgic(nr_irqs, false /* level */, false /* eoi_split */); + test_vgic(nr_irqs, false /* level */, true /* eoi_split */); + test_vgic(nr_irqs, true /* level */, false /* eoi_split */); + test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + } else { + test_vgic(nr_irqs, level_sensitive, eoi_split); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c new file mode 100644 index 000000000000..fc4fe52fb6f8 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_lpi_stress - Stress test for KVM's ITS emulation + * + * Copyright (c) 2024 Google LLC + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "gic_v3_its.h" +#include "processor.h" +#include "ucall.h" +#include "vgic.h" + +#define TEST_MEMSLOT_INDEX 1 + +#define GIC_LPI_OFFSET 8192 + +static size_t nr_iterations = 1000; +static vm_paddr_t gpa_base; + +static struct kvm_vm *vm; +static struct kvm_vcpu **vcpus; +static int gic_fd, its_fd; + +static struct test_data { + bool request_vcpus_stop; + u32 nr_cpus; + u32 nr_devices; + u32 nr_event_ids; + + vm_paddr_t device_table; + vm_paddr_t collection_table; + vm_paddr_t cmdq_base; + void *cmdq_base_va; + vm_paddr_t itt_tables; + + vm_paddr_t lpi_prop_table; + vm_paddr_t lpi_pend_tables; +} test_data = { + .nr_cpus = 1, + .nr_devices = 1, + .nr_event_ids = 16, +}; + +static void guest_irq_handler(struct ex_regs *regs) +{ + u32 intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(intid >= GIC_LPI_OFFSET); + gic_set_eoi(intid); +} + +static void guest_setup_its_mappings(void) +{ + u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; + u32 nr_events = test_data.nr_event_ids; + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + + for (coll_id = 0; coll_id < nr_cpus; coll_id++) + its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); + + /* Round-robin the LPIs to all of the vCPUs in the VM */ + coll_id = 0; + for (device_id = 0; device_id < nr_devices; device_id++) { + vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); + + its_send_mapd_cmd(test_data.cmdq_base_va, device_id, + itt_base, SZ_64K, true); + + for (event_id = 0; event_id < nr_events; event_id++) { + its_send_mapti_cmd(test_data.cmdq_base_va, device_id, + event_id, coll_id, intid++); + + coll_id = (coll_id + 1) % test_data.nr_cpus; + } + } +} + +static void guest_invalidate_all_rdists(void) +{ + int i; + + for (i = 0; i < test_data.nr_cpus; i++) + its_send_invall_cmd(test_data.cmdq_base_va, i); +} + +static void guest_setup_gic(void) +{ + static atomic_int nr_cpus_ready = 0; + u32 cpuid = guest_get_vcpuid(); + + gic_init(GIC_V3, test_data.nr_cpus); + gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, + test_data.lpi_pend_tables + (cpuid * SZ_64K)); + + atomic_fetch_add(&nr_cpus_ready, 1); + + if (cpuid > 0) + return; + + while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) + cpu_relax(); + + its_init(test_data.collection_table, SZ_64K, + test_data.device_table, SZ_64K, + test_data.cmdq_base, SZ_64K); + + guest_setup_its_mappings(); + guest_invalidate_all_rdists(); +} + +static void guest_code(size_t nr_lpis) +{ + guest_setup_gic(); + + GUEST_SYNC(0); + + /* + * Don't use WFI here to avoid blocking the vCPU thread indefinitely and + * never getting the stop 
signal. + */ + while (!READ_ONCE(test_data.request_vcpus_stop)) + cpu_relax(); + + GUEST_DONE(); +} + +static void setup_memslot(void) +{ + size_t pages; + size_t sz; + + /* + * For the ITS: + * - A single level device table + * - A single level collection table + * - The command queue + * - An ITT for each device + */ + sz = (3 + test_data.nr_devices) * SZ_64K; + + /* + * For the redistributors: + * - A shared LPI configuration table + * - An LPI pending table for each vCPU + */ + sz += (1 + test_data.nr_cpus) * SZ_64K; + + pages = sz / vm->page_size; + gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, + TEST_MEMSLOT_INDEX, pages, 0); +} + +#define LPI_PROP_DEFAULT_PRIO 0xa0 + +static void configure_lpis(void) +{ + size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; + u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); + size_t i; + + for (i = 0; i < nr_lpis; i++) { + tbl[i] = LPI_PROP_DEFAULT_PRIO | + LPI_PROP_GROUP1 | + LPI_PROP_ENABLED; + } +} + +static void setup_test_data(void) +{ + size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + vm_paddr_t cmdq_base; + + test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, + TEST_MEMSLOT_INDEX); + virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); + test_data.cmdq_base = cmdq_base; + test_data.cmdq_base_va = (void *)cmdq_base; + + test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, + gpa_base, TEST_MEMSLOT_INDEX); + + test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, TEST_MEMSLOT_INDEX); + configure_lpis(); + + test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, + gpa_base, TEST_MEMSLOT_INDEX); + + sync_global_to_guest(vm, test_data); +} + +static void setup_gic(void) +{ + gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); + + its_fd = vgic_its_setup(vm); +} + +static void signal_lpi(u32 device_id, u32 event_id) +{ + vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; + + struct kvm_msi msi = { + .address_lo = db_addr, + .address_hi = db_addr >> 32, + .data = event_id, + .devid = device_id, + .flags = KVM_MSI_VALID_DEVID, + }; + + /* + * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, + * which for arm64 implies having a valid translation in the ITS. 
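+ * A return of 0 would mean the guest blocked the MSI (e.g. no valid
+ * ITS mapping for this device/event pair), and a negative value an
+ * ioctl error.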
+ */ + TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, + "KVM_SIGNAL_MSI ioctl failed"); +} + +static pthread_barrier_t test_setup_barrier; + +static void *lpi_worker_thread(void *data) +{ + u32 device_id = (size_t)data; + u32 event_id; + size_t i; + + pthread_barrier_wait(&test_setup_barrier); + + for (i = 0; i < nr_iterations; i++) + for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) + signal_lpi(device_id, event_id); + + return NULL; +} + +static void *vcpu_worker_thread(void *data) +{ + struct kvm_vcpu *vcpu = data; + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pthread_barrier_wait(&test_setup_barrier); + continue; + case UCALL_DONE: + return NULL; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall: %lu", uc.cmd); + } + } + + return NULL; +} + +static void report_stats(struct timespec delta) +{ + double nr_lpis; + double time; + + nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; + + time = delta.tv_sec; + time += ((double)delta.tv_nsec) / NSEC_PER_SEC; + + pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); +} + +static void run_test(void) +{ + u32 nr_devices = test_data.nr_devices; + u32 nr_vcpus = test_data.nr_cpus; + pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); + pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); + struct timespec start, delta; + size_t i; + + TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); + + pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); + + for (i = 0; i < nr_devices; i++) + pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); + + pthread_barrier_wait(&test_setup_barrier); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (i = 0; i < nr_devices; i++) + pthread_join(lpi_threads[i], NULL); + + delta = timespec_elapsed(start); + write_guest_global(vm, test_data.request_vcpus_stop, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + report_stats(delta); +} + +static void setup_vm(void) +{ + int i; + + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + for (i = 0; i < test_data.nr_cpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + setup_memslot(); + + setup_gic(); + + setup_test_data(); +} + +static void destroy_vm(void) +{ + close(its_fd); + close(gic_fd); + kvm_vm_free(vm); + free(vcpus); +} + +static void pr_usage(const char *name) +{ + pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); + pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); + pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); + pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); + pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); +} + +int main(int argc, char **argv) +{ + u32 nr_threads; + int c; + + while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { + switch (c) { + case 'v': + test_data.nr_cpus = atoi(optarg); + break; + case 'd': + test_data.nr_devices = atoi(optarg); + break; + case 'e': + test_data.nr_event_ids = 
atoi(optarg); + break; + case 'i': + nr_iterations = strtoul(optarg, NULL, 0); + break; + case 'h': + default: + pr_usage(argv[0]); + return 1; + } + } + + nr_threads = test_data.nr_cpus + test_data.nr_devices; + if (nr_threads > get_nprocs()) + pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", + nr_threads, get_nprocs()); + + setup_vm(); + + run_test(); + + destroy_vm(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c new file mode 100644 index 000000000000..f16b3b27e32e --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vpmu_counter_access - Test vPMU event counter access + * + * Copyright (c) 2023 Google LLC. + * + * This test checks if the guest can see the same number of the PMU event + * counters (PMCR_EL0.N) that userspace sets, if the guest can access + * those counters, and if the guest is prevented from accessing any + * other counters. + * It also checks if the userspace accesses to the PMU regsisters honor the + * PMCR.N value that's set for the guest. + * This test runs only when KVM_CAP_ARM_PMU_V3 is supported on the host. + */ +#include +#include +#include +#include +#include +#include + +/* The max number of the PMU event counters (excluding the cycle counter) */ +#define ARMV8_PMU_MAX_GENERAL_COUNTERS (ARMV8_PMU_MAX_COUNTERS - 1) + +/* The cycle counter bit position that's common among the PMU registers */ +#define ARMV8_PMU_CYCLE_IDX 31 + +struct vpmu_vm { + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + int gic_fd; +}; + +static struct vpmu_vm vpmu_vm; + +struct pmreg_sets { + uint64_t set_reg_id; + uint64_t clr_reg_id; +}; + +#define PMREG_SET(set, clr) {.set_reg_id = set, .clr_reg_id = clr} + +static uint64_t get_pmcr_n(uint64_t pmcr) +{ + return FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); +} + +static void set_pmcr_n(uint64_t *pmcr, uint64_t pmcr_n) +{ + u64p_replace_bits((__u64 *) pmcr, pmcr_n, ARMV8_PMU_PMCR_N); +} + +static uint64_t get_counters_mask(uint64_t n) +{ + uint64_t mask = BIT(ARMV8_PMU_CYCLE_IDX); + + if (n) + mask |= GENMASK(n - 1, 0); + return mask; +} + +/* Read PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ +static inline unsigned long read_sel_evcntr(int sel) +{ + write_sysreg(sel, pmselr_el0); + isb(); + return read_sysreg(pmxevcntr_el0); +} + +/* Write PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ +static inline void write_sel_evcntr(int sel, unsigned long val) +{ + write_sysreg(sel, pmselr_el0); + isb(); + write_sysreg(val, pmxevcntr_el0); + isb(); +} + +/* Read PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ +static inline unsigned long read_sel_evtyper(int sel) +{ + write_sysreg(sel, pmselr_el0); + isb(); + return read_sysreg(pmxevtyper_el0); +} + +/* Write PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ +static inline void write_sel_evtyper(int sel, unsigned long val) +{ + write_sysreg(sel, pmselr_el0); + isb(); + write_sysreg(val, pmxevtyper_el0); + isb(); +} + +static void pmu_disable_reset(void) +{ + uint64_t pmcr = read_sysreg(pmcr_el0); + + /* Reset all counters, disabling them */ + pmcr &= ~ARMV8_PMU_PMCR_E; + write_sysreg(pmcr | ARMV8_PMU_PMCR_P, pmcr_el0); + isb(); +} + +#define RETURN_READ_PMEVCNTRN(n) \ + return read_sysreg(pmevcntr##n##_el0) +static unsigned long read_pmevcntrn(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); + return 0; +} + +#define WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0) +static void write_pmevcntrn(int n, unsigned 
long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); + isb(); +} + +#define READ_PMEVTYPERN(n) \ + return read_sysreg(pmevtyper##n##_el0) +static unsigned long read_pmevtypern(int n) +{ + PMEVN_SWITCH(n, READ_PMEVTYPERN); + return 0; +} + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0) +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); + isb(); +} + +/* + * The pmc_accessor structure has pointers to PMEV{CNTR,TYPER}_EL0 + * accessors that test cases will use. Each of the accessors will + * either directly reads/writes PMEV{CNTR,TYPER}_EL0 + * (i.e. {read,write}_pmev{cnt,type}rn()), or reads/writes them through + * PMXEV{CNTR,TYPER}_EL0 (i.e. {read,write}_sel_ev{cnt,type}r()). + * + * This is used to test that combinations of those accessors provide + * the consistent behavior. + */ +struct pmc_accessor { + /* A function to be used to read PMEVTCNTR_EL0 */ + unsigned long (*read_cntr)(int idx); + /* A function to be used to write PMEVTCNTR_EL0 */ + void (*write_cntr)(int idx, unsigned long val); + /* A function to be used to read PMEVTYPER_EL0 */ + unsigned long (*read_typer)(int idx); + /* A function to be used to write PMEVTYPER_EL0 */ + void (*write_typer)(int idx, unsigned long val); +}; + +struct pmc_accessor pmc_accessors[] = { + /* test with all direct accesses */ + { read_pmevcntrn, write_pmevcntrn, read_pmevtypern, write_pmevtypern }, + /* test with all indirect accesses */ + { read_sel_evcntr, write_sel_evcntr, read_sel_evtyper, write_sel_evtyper }, + /* read with direct accesses, and write with indirect accesses */ + { read_pmevcntrn, write_sel_evcntr, read_pmevtypern, write_sel_evtyper }, + /* read with indirect accesses, and write with direct accesses */ + { read_sel_evcntr, write_pmevcntrn, read_sel_evtyper, write_pmevtypern }, +}; + +/* + * Convert a pointer of pmc_accessor to an index in pmc_accessors[], + * assuming that the pointer is one of the entries in pmc_accessors[]. + */ +#define PMC_ACC_TO_IDX(acc) (acc - &pmc_accessors[0]) + +#define GUEST_ASSERT_BITMAP_REG(regname, mask, set_expected) \ +{ \ + uint64_t _tval = read_sysreg(regname); \ + \ + if (set_expected) \ + __GUEST_ASSERT((_tval & mask), \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ + _tval, mask, set_expected); \ + else \ + __GUEST_ASSERT(!(_tval & mask), \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ + _tval, mask, set_expected); \ +} + +/* + * Check if @mask bits in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers + * are set or cleared as specified in @set_expected. + */ +static void check_bitmap_pmu_regs(uint64_t mask, bool set_expected) +{ + GUEST_ASSERT_BITMAP_REG(pmcntenset_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmcntenclr_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmintenset_el1, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmintenclr_el1, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmovsset_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmovsclr_el0, mask, set_expected); +} + +/* + * Check if the bit in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers corresponding + * to the specified counter (@pmc_idx) can be read/written as expected. + * When @set_op is true, it tries to set the bit for the counter in + * those registers by writing the SET registers (the bit won't be set + * if the counter is not implemented though). + * Otherwise, it tries to clear the bits in the registers by writing + * the CLR registers. + * Then, it checks if the values indicated in the registers are as expected. 
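+ * For an unimplemented counter the SET write is expected to have no
+ * effect, i.e. the bit stays zero.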
+ */ +static void test_bitmap_pmu_regs(int pmc_idx, bool set_op) +{ + uint64_t pmcr_n, test_bit = BIT(pmc_idx); + bool set_expected = false; + + if (set_op) { + write_sysreg(test_bit, pmcntenset_el0); + write_sysreg(test_bit, pmintenset_el1); + write_sysreg(test_bit, pmovsset_el0); + + /* The bit will be set only if the counter is implemented */ + pmcr_n = get_pmcr_n(read_sysreg(pmcr_el0)); + set_expected = (pmc_idx < pmcr_n) ? true : false; + } else { + write_sysreg(test_bit, pmcntenclr_el0); + write_sysreg(test_bit, pmintenclr_el1); + write_sysreg(test_bit, pmovsclr_el0); + } + check_bitmap_pmu_regs(test_bit, set_expected); +} + +/* + * Tests for reading/writing registers for the (implemented) event counter + * specified by @pmc_idx. + */ +static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx) +{ + uint64_t write_data, read_data; + + /* Disable all PMCs and reset all PMCs to zero. */ + pmu_disable_reset(); + + /* + * Tests for reading/writing {PMCNTEN,PMINTEN,PMOVS}{SET,CLR}_EL1. + */ + + /* Make sure that the bit in those registers are set to 0 */ + test_bitmap_pmu_regs(pmc_idx, false); + /* Test if setting the bit in those registers works */ + test_bitmap_pmu_regs(pmc_idx, true); + /* Test if clearing the bit in those registers works */ + test_bitmap_pmu_regs(pmc_idx, false); + + /* + * Tests for reading/writing the event type register. + */ + + /* + * Set the event type register to an arbitrary value just for testing + * of reading/writing the register. + * Arm ARM says that for the event from 0x0000 to 0x003F, + * the value indicated in the PMEVTYPER_EL0.evtCount field is + * the value written to the field even when the specified event + * is not supported. + */ + write_data = (ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMUV3_PERFCTR_INST_RETIRED); + acc->write_typer(pmc_idx, write_data); + read_data = acc->read_typer(pmc_idx); + __GUEST_ASSERT(read_data == write_data, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); + + /* + * Tests for reading/writing the event count register. + */ + + read_data = acc->read_cntr(pmc_idx); + + /* The count value must be 0, as it is disabled and reset */ + __GUEST_ASSERT(read_data == 0, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data); + + write_data = read_data + pmc_idx + 0x12345; + acc->write_cntr(pmc_idx, write_data); + read_data = acc->read_cntr(pmc_idx); + __GUEST_ASSERT(read_data == write_data, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); +} + +#define INVALID_EC (-1ul) +uint64_t expected_ec = INVALID_EC; + +static void guest_sync_handler(struct ex_regs *regs) +{ + uint64_t esr, ec; + + esr = read_sysreg(esr_el1); + ec = ESR_ELx_EC(esr); + + __GUEST_ASSERT(expected_ec == ec, + "PC: 0x%lx; ESR: 0x%lx; EC: 0x%lx; EC expected: 0x%lx", + regs->pc, esr, ec, expected_ec); + + /* skip the trapping instruction */ + regs->pc += 4; + + /* Use INVALID_EC to indicate an exception occurred */ + expected_ec = INVALID_EC; +} + +/* + * Run the given operation that should trigger an exception with the + * given exception class. The exception handler (guest_sync_handler) + * will reset op_end_addr to 0, expected_ec to INVALID_EC, and skip + * the instruction that trapped. 
+ */ +#define TEST_EXCEPTION(ec, ops) \ +({ \ + GUEST_ASSERT(ec != INVALID_EC); \ + WRITE_ONCE(expected_ec, ec); \ + dsb(ish); \ + ops; \ + GUEST_ASSERT(expected_ec == INVALID_EC); \ +}) + +/* + * Tests for reading/writing registers for the unimplemented event counter + * specified by @pmc_idx (>= PMCR_EL0.N). + */ +static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx) +{ + /* + * Reading/writing the event count/type registers should cause + * an UNDEFINED exception. + */ + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0)); + /* + * The bit corresponding to the (unimplemented) counter in + * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ. + */ + test_bitmap_pmu_regs(pmc_idx, 1); + test_bitmap_pmu_regs(pmc_idx, 0); +} + +/* + * The guest is configured with PMUv3 with @expected_pmcr_n number of + * event counters. + * Check if @expected_pmcr_n is consistent with PMCR_EL0.N, and + * if reading/writing PMU registers for implemented or unimplemented + * counters works as expected. + */ +static void guest_code(uint64_t expected_pmcr_n) +{ + uint64_t pmcr, pmcr_n, unimp_mask; + int i, pmc; + + __GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS, + "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x", + expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS); + + pmcr = read_sysreg(pmcr_el0); + pmcr_n = get_pmcr_n(pmcr); + + /* Make sure that PMCR_EL0.N indicates the value userspace set */ + __GUEST_ASSERT(pmcr_n == expected_pmcr_n, + "Expected PMCR.N: 0x%lx, PMCR.N: 0x%lx", + expected_pmcr_n, pmcr_n); + + /* + * Make sure that (RAZ) bits corresponding to unimplemented event + * counters in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers are reset + * to zero. + * (NOTE: bits for implemented event counters are reset to UNKNOWN) + */ + unimp_mask = GENMASK_ULL(ARMV8_PMU_MAX_GENERAL_COUNTERS - 1, pmcr_n); + check_bitmap_pmu_regs(unimp_mask, false); + + /* + * Tests for reading/writing PMU registers for implemented counters. + * Use each combination of PMEV{CNTR,TYPER}_EL0 accessor functions. + */ + for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { + for (pmc = 0; pmc < pmcr_n; pmc++) + test_access_pmc_regs(&pmc_accessors[i], pmc); + } + + /* + * Tests for reading/writing PMU registers for unimplemented counters. + * Use each combination of PMEV{CNTR,TYPER}_EL0 accessor functions. + */ + for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { + for (pmc = pmcr_n; pmc < ARMV8_PMU_MAX_GENERAL_COUNTERS; pmc++) + test_access_invalid_pmc_regs(&pmc_accessors[i], pmc); + } + + GUEST_DONE(); +} + +/* Create a VM that has one vCPU with PMUv3 configured. */ +static void create_vpmu_vm(void *guest_code) +{ + struct kvm_vcpu_init init; + uint8_t pmuver, ec; + uint64_t dfr0, irq = 23; + struct kvm_device_attr irq_attr = { + .group = KVM_ARM_VCPU_PMU_V3_CTRL, + .attr = KVM_ARM_VCPU_PMU_V3_IRQ, + .addr = (uint64_t)&irq, + }; + struct kvm_device_attr init_attr = { + .group = KVM_ARM_VCPU_PMU_V3_CTRL, + .attr = KVM_ARM_VCPU_PMU_V3_INIT, + }; + + /* The test creates the vpmu_vm multiple times. 
Ensure a clean state */ + memset(&vpmu_vm, 0, sizeof(vpmu_vm)); + + vpmu_vm.vm = vm_create(1); + vm_init_descriptor_tables(vpmu_vm.vm); + for (ec = 0; ec < ESR_ELx_EC_MAX + 1; ec++) { + vm_install_sync_handler(vpmu_vm.vm, VECTOR_SYNC_CURRENT, ec, + guest_sync_handler); + } + + /* Create vCPU with PMUv3 */ + vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); + vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); + vcpu_init_descriptor_tables(vpmu_vm.vcpu); + vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); + __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, + "Failed to create vgic-v3, skipping"); + + /* Make sure that PMUv3 support is indicated in the ID register */ + dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); + TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && + pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, + "Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver); + + /* Initialize vPMU */ + vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &irq_attr); + vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &init_attr); +} + +static void destroy_vpmu_vm(void) +{ + close(vpmu_vm.gic_fd); + kvm_vm_free(vpmu_vm.vm); +} + +static void run_vcpu(struct kvm_vcpu *vcpu, uint64_t pmcr_n) +{ + struct ucall uc; + + vcpu_args_set(vcpu, 1, pmcr_n); + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) +{ + struct kvm_vcpu *vcpu; + uint64_t pmcr, pmcr_orig; + + create_vpmu_vm(guest_code); + vcpu = vpmu_vm.vcpu; + + pmcr_orig = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); + pmcr = pmcr_orig; + + /* + * Setting a larger value of PMCR.N should not modify the field, and + * return a success. + */ + set_pmcr_n(&pmcr, pmcr_n); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), pmcr); + pmcr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); + + if (expect_fail) + TEST_ASSERT(pmcr_orig == pmcr, + "PMCR.N modified by KVM to a larger value (PMCR: 0x%lx) for pmcr_n: 0x%lx", + pmcr, pmcr_n); + else + TEST_ASSERT(pmcr_n == get_pmcr_n(pmcr), + "Failed to update PMCR.N to %lu (received: %lu)", + pmcr_n, get_pmcr_n(pmcr)); +} + +/* + * Create a guest with one vCPU, set the PMCR_EL0.N for the vCPU to @pmcr_n, + * and run the test. + */ +static void run_access_test(uint64_t pmcr_n) +{ + uint64_t sp; + struct kvm_vcpu *vcpu; + struct kvm_vcpu_init init; + + pr_debug("Test with pmcr_n %lu\n", pmcr_n); + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + vcpu = vpmu_vm.vcpu; + + /* Save the initial sp to restore them later to run the guest again */ + sp = vcpu_get_reg(vcpu, ARM64_CORE_REG(sp_el1)); + + run_vcpu(vcpu, pmcr_n); + + /* + * Reset and re-initialize the vCPU, and run the guest code again to + * check if PMCR_EL0.N is preserved. 
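+ * Re-initializing the vCPU resets its core registers, which is why SP
+ * and PC are restored by hand before the second run.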
+ */ + vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); + aarch64_vcpu_setup(vcpu, &init); + vcpu_init_descriptor_tables(vcpu); + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), sp); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + + run_vcpu(vcpu, pmcr_n); + + destroy_vpmu_vm(); +} + +static struct pmreg_sets validity_check_reg_sets[] = { + PMREG_SET(SYS_PMCNTENSET_EL0, SYS_PMCNTENCLR_EL0), + PMREG_SET(SYS_PMINTENSET_EL1, SYS_PMINTENCLR_EL1), + PMREG_SET(SYS_PMOVSSET_EL0, SYS_PMOVSCLR_EL0), +}; + +/* + * Create a VM, and check if KVM handles the userspace accesses of + * the PMU register sets in @validity_check_reg_sets[] correctly. + */ +static void run_pmregs_validity_test(uint64_t pmcr_n) +{ + int i; + struct kvm_vcpu *vcpu; + uint64_t set_reg_id, clr_reg_id, reg_val; + uint64_t valid_counters_mask, max_counters_mask; + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + vcpu = vpmu_vm.vcpu; + + valid_counters_mask = get_counters_mask(pmcr_n); + max_counters_mask = get_counters_mask(ARMV8_PMU_MAX_COUNTERS); + + for (i = 0; i < ARRAY_SIZE(validity_check_reg_sets); i++) { + set_reg_id = validity_check_reg_sets[i].set_reg_id; + clr_reg_id = validity_check_reg_sets[i].clr_reg_id; + + /* + * Test if the 'set' and 'clr' variants of the registers + * are initialized based on the number of valid counters. + */ + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(set_reg_id), reg_val); + + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(clr_reg_id), reg_val); + + /* + * Using the 'set' variant, force-set the register to the + * max number of possible counters and test if KVM discards + * the bits for unimplemented counters as it should. + */ + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), max_counters_mask); + + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id)); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(set_reg_id), reg_val); + + reg_val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id)); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(clr_reg_id), reg_val); + } + + destroy_vpmu_vm(); +} + +/* + * Create a guest with one vCPU, and attempt to set the PMCR_EL0.N for + * the vCPU to @pmcr_n, which is larger than the host value. + * The attempt should fail as @pmcr_n is too big to set for the vCPU. + */ +static void run_error_test(uint64_t pmcr_n) +{ + pr_debug("Error test with pmcr_n %lu (larger than the host)\n", pmcr_n); + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, true); + destroy_vpmu_vm(); +} + +/* + * Return the default number of implemented PMU event counters excluding + * the cycle counter (i.e. PMCR_EL0.N value) for the guest. 
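+ * The limit is taken from the reset value of PMCR_EL0 on a scratch VM
+ * that is created and destroyed just for this read.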
+ */ +static uint64_t get_pmcr_n_limit(void) +{ + uint64_t pmcr; + + create_vpmu_vm(guest_code); + pmcr = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); + destroy_vpmu_vm(); + return get_pmcr_n(pmcr); +} + +int main(void) +{ + uint64_t i, pmcr_n; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3)); + + pmcr_n = get_pmcr_n_limit(); + for (i = 0; i <= pmcr_n; i++) { + run_access_test(i); + run_pmregs_validity_test(i); + } + + for (i = pmcr_n + 1; i < ARMV8_PMU_MAX_COUNTERS; i++) + run_error_test(i); + + return 0; +} diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 9f24303acb8c..e79817bd0e29 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -21,7 +21,7 @@ #include "ucall_common.h" #ifdef __aarch64__ -#include "aarch64/vgic.h" +#include "arm64/vgic.h" static int gic_fd; diff --git a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h deleted file mode 100644 index bf461de34785..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ARM Generic Timer specific interface - */ - -#ifndef SELFTEST_KVM_ARCH_TIMER_H -#define SELFTEST_KVM_ARCH_TIMER_H - -#include "processor.h" - -enum arch_timer { - VIRTUAL, - PHYSICAL, -}; - -#define CTL_ENABLE (1 << 0) -#define CTL_IMASK (1 << 1) -#define CTL_ISTATUS (1 << 2) - -#define msec_to_cycles(msec) \ - (timer_get_cntfrq() * (uint64_t)(msec) / 1000) - -#define usec_to_cycles(usec) \ - (timer_get_cntfrq() * (uint64_t)(usec) / 1000000) - -#define cycles_to_usec(cycles) \ - ((uint64_t)(cycles) * 1000000 / timer_get_cntfrq()) - -static inline uint32_t timer_get_cntfrq(void) -{ - return read_sysreg(cntfrq_el0); -} - -static inline uint64_t timer_get_cntct(enum arch_timer timer) -{ - isb(); - - switch (timer) { - case VIRTUAL: - return read_sysreg(cntvct_el0); - case PHYSICAL: - return read_sysreg(cntpct_el0); - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - /* We should not reach here */ - return 0; -} - -static inline void timer_set_cval(enum arch_timer timer, uint64_t cval) -{ - switch (timer) { - case VIRTUAL: - write_sysreg(cval, cntv_cval_el0); - break; - case PHYSICAL: - write_sysreg(cval, cntp_cval_el0); - break; - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - isb(); -} - -static inline uint64_t timer_get_cval(enum arch_timer timer) -{ - switch (timer) { - case VIRTUAL: - return read_sysreg(cntv_cval_el0); - case PHYSICAL: - return read_sysreg(cntp_cval_el0); - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - /* We should not reach here */ - return 0; -} - -static inline void timer_set_tval(enum arch_timer timer, int32_t tval) -{ - switch (timer) { - case VIRTUAL: - write_sysreg(tval, cntv_tval_el0); - break; - case PHYSICAL: - write_sysreg(tval, cntp_tval_el0); - break; - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - isb(); -} - -static inline int32_t timer_get_tval(enum arch_timer timer) -{ - isb(); - switch (timer) { - case VIRTUAL: - return read_sysreg(cntv_tval_el0); - case PHYSICAL: - return read_sysreg(cntp_tval_el0); - default: - GUEST_FAIL("Could not get timer %d\n", timer); - } - - /* We should not reach here */ - return 0; -} - -static inline void timer_set_ctl(enum arch_timer timer, uint32_t ctl) -{ - switch (timer) { - case VIRTUAL: - 
write_sysreg(ctl, cntv_ctl_el0); - break; - case PHYSICAL: - write_sysreg(ctl, cntp_ctl_el0); - break; - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - isb(); -} - -static inline uint32_t timer_get_ctl(enum arch_timer timer) -{ - switch (timer) { - case VIRTUAL: - return read_sysreg(cntv_ctl_el0); - case PHYSICAL: - return read_sysreg(cntp_ctl_el0); - default: - GUEST_FAIL("Unexpected timer type = %u", timer); - } - - /* We should not reach here */ - return 0; -} - -static inline void timer_set_next_cval_ms(enum arch_timer timer, uint32_t msec) -{ - uint64_t now_ct = timer_get_cntct(timer); - uint64_t next_ct = now_ct + msec_to_cycles(msec); - - timer_set_cval(timer, next_ct); -} - -static inline void timer_set_next_tval_ms(enum arch_timer timer, uint32_t msec) -{ - timer_set_tval(timer, msec_to_cycles(msec)); -} - -#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/delay.h b/tools/testing/selftests/kvm/include/aarch64/delay.h deleted file mode 100644 index 329e4f5079ea..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/delay.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ARM simple delay routines - */ - -#ifndef SELFTEST_KVM_ARM_DELAY_H -#define SELFTEST_KVM_ARM_DELAY_H - -#include "arch_timer.h" - -static inline void __delay(uint64_t cycles) -{ - enum arch_timer timer = VIRTUAL; - uint64_t start = timer_get_cntct(timer); - - while ((timer_get_cntct(timer) - start) < cycles) - cpu_relax(); -} - -static inline void udelay(unsigned long usec) -{ - __delay(usec_to_cycles(usec)); -} - -#endif /* SELFTEST_KVM_ARM_DELAY_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h deleted file mode 100644 index baeb3c859389..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ARM Generic Interrupt Controller (GIC) specific defines - */ - -#ifndef SELFTEST_KVM_GIC_H -#define SELFTEST_KVM_GIC_H - -#include - -enum gic_type { - GIC_V3, - GIC_TYPE_MAX, -}; - -/* - * Note that the redistributor frames are at the end, as the range scales - * with the number of vCPUs in the VM. - */ -#define GITS_BASE_GPA 0x8000000ULL -#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) -#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) - -/* The GIC is identity-mapped into the guest at the time of setup. */ -#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) -#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) -#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) - -#define MIN_SGI 0 -#define MIN_PPI 16 -#define MIN_SPI 32 -#define MAX_SPI 1019 -#define IAR_SPURIOUS 1023 - -#define INTID_IS_SGI(intid) (0 <= (intid) && (intid) < MIN_PPI) -#define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) -#define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) - -void gic_init(enum gic_type type, unsigned int nr_cpus); -void gic_irq_enable(unsigned int intid); -void gic_irq_disable(unsigned int intid); -unsigned int gic_get_and_ack_irq(void); -void gic_set_eoi(unsigned int intid); -void gic_set_dir(unsigned int intid); - -/* - * Sets the EOI mode. When split is false, EOI just drops the priority. When - * split is true, EOI drops the priority and deactivates the interrupt. 
- */ -void gic_set_eoi_split(bool split); -void gic_set_priority_mask(uint64_t mask); -void gic_set_priority(uint32_t intid, uint32_t prio); -void gic_irq_set_active(unsigned int intid); -void gic_irq_clear_active(unsigned int intid); -bool gic_irq_get_active(unsigned int intid); -void gic_irq_set_pending(unsigned int intid); -void gic_irq_clear_pending(unsigned int intid); -bool gic_irq_get_pending(unsigned int intid); -void gic_irq_set_config(unsigned int intid, bool is_edge); - -void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, - vm_paddr_t pend_table); - -#endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h deleted file mode 100644 index a76615fa39a1..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h +++ /dev/null @@ -1,604 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier - */ -#ifndef __SELFTESTS_GIC_V3_H -#define __SELFTESTS_GIC_V3_H - -/* - * Distributor registers. We assume we're running non-secure, with ARE - * being set. Secure-only and non-ARE registers are not described. - */ -#define GICD_CTLR 0x0000 -#define GICD_TYPER 0x0004 -#define GICD_IIDR 0x0008 -#define GICD_TYPER2 0x000C -#define GICD_STATUSR 0x0010 -#define GICD_SETSPI_NSR 0x0040 -#define GICD_CLRSPI_NSR 0x0048 -#define GICD_SETSPI_SR 0x0050 -#define GICD_CLRSPI_SR 0x0058 -#define GICD_IGROUPR 0x0080 -#define GICD_ISENABLER 0x0100 -#define GICD_ICENABLER 0x0180 -#define GICD_ISPENDR 0x0200 -#define GICD_ICPENDR 0x0280 -#define GICD_ISACTIVER 0x0300 -#define GICD_ICACTIVER 0x0380 -#define GICD_IPRIORITYR 0x0400 -#define GICD_ICFGR 0x0C00 -#define GICD_IGRPMODR 0x0D00 -#define GICD_NSACR 0x0E00 -#define GICD_IGROUPRnE 0x1000 -#define GICD_ISENABLERnE 0x1200 -#define GICD_ICENABLERnE 0x1400 -#define GICD_ISPENDRnE 0x1600 -#define GICD_ICPENDRnE 0x1800 -#define GICD_ISACTIVERnE 0x1A00 -#define GICD_ICACTIVERnE 0x1C00 -#define GICD_IPRIORITYRnE 0x2000 -#define GICD_ICFGRnE 0x3000 -#define GICD_IROUTER 0x6000 -#define GICD_IROUTERnE 0x8000 -#define GICD_IDREGS 0xFFD0 -#define GICD_PIDR2 0xFFE8 - -#define ESPI_BASE_INTID 4096 - -/* - * Those registers are actually from GICv2, but the spec demands that they - * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). 
- */ -#define GICD_ITARGETSR 0x0800 -#define GICD_SGIR 0x0F00 -#define GICD_CPENDSGIR 0x0F10 -#define GICD_SPENDSGIR 0x0F20 - -#define GICD_CTLR_RWP (1U << 31) -#define GICD_CTLR_nASSGIreq (1U << 8) -#define GICD_CTLR_DS (1U << 6) -#define GICD_CTLR_ARE_NS (1U << 4) -#define GICD_CTLR_ENABLE_G1A (1U << 1) -#define GICD_CTLR_ENABLE_G1 (1U << 0) - -#define GICD_IIDR_IMPLEMENTER_SHIFT 0 -#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) -#define GICD_IIDR_REVISION_SHIFT 12 -#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) -#define GICD_IIDR_VARIANT_SHIFT 16 -#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) -#define GICD_IIDR_PRODUCT_ID_SHIFT 24 -#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) - - -/* - * In systems with a single security state (what we emulate in KVM) - * the meaning of the interrupt group enable bits is slightly different - */ -#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) -#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) - -#define GICD_TYPER_RSS (1U << 26) -#define GICD_TYPER_LPIS (1U << 17) -#define GICD_TYPER_MBIS (1U << 16) -#define GICD_TYPER_ESPI (1U << 8) - -#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) -#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) -#define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) -#define GICD_TYPER_ESPIS(typer) \ - (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) - -#define GICD_TYPER2_nASSGIcap (1U << 8) -#define GICD_TYPER2_VIL (1U << 7) -#define GICD_TYPER2_VID GENMASK(4, 0) - -#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) -#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) - -#define GIC_PIDR2_ARCH_MASK 0xf0 -#define GIC_PIDR2_ARCH_GICv3 0x30 -#define GIC_PIDR2_ARCH_GICv4 0x40 - -#define GIC_V3_DIST_SIZE 0x10000 - -#define GIC_PAGE_SIZE_4K 0ULL -#define GIC_PAGE_SIZE_16K 1ULL -#define GIC_PAGE_SIZE_64K 2ULL -#define GIC_PAGE_SIZE_MASK 3ULL - -/* - * Re-Distributor registers, offsets from RD_base - */ -#define GICR_CTLR GICD_CTLR -#define GICR_IIDR 0x0004 -#define GICR_TYPER 0x0008 -#define GICR_STATUSR GICD_STATUSR -#define GICR_WAKER 0x0014 -#define GICR_SETLPIR 0x0040 -#define GICR_CLRLPIR 0x0048 -#define GICR_PROPBASER 0x0070 -#define GICR_PENDBASER 0x0078 -#define GICR_INVLPIR 0x00A0 -#define GICR_INVALLR 0x00B0 -#define GICR_SYNCR 0x00C0 -#define GICR_IDREGS GICD_IDREGS -#define GICR_PIDR2 GICD_PIDR2 - -#define GICR_CTLR_ENABLE_LPIS (1UL << 0) -#define GICR_CTLR_CES (1UL << 1) -#define GICR_CTLR_IR (1UL << 2) -#define GICR_CTLR_RWP (1UL << 3) - -#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) - -#define EPPI_BASE_INTID 1056 - -#define GICR_TYPER_NR_PPIS(r) \ - ({ \ - unsigned int __ppinum = ((r) >> 27) & 0x1f; \ - unsigned int __nr_ppis = 16; \ - if (__ppinum == 1 || __ppinum == 2) \ - __nr_ppis += __ppinum * 32; \ - \ - __nr_ppis; \ - }) - -#define GICR_WAKER_ProcessorSleep (1U << 1) -#define GICR_WAKER_ChildrenAsleep (1U << 2) - -#define GIC_BASER_CACHE_nCnB 0ULL -#define GIC_BASER_CACHE_SameAsInner 0ULL -#define GIC_BASER_CACHE_nC 1ULL -#define GIC_BASER_CACHE_RaWt 2ULL -#define GIC_BASER_CACHE_RaWb 3ULL -#define GIC_BASER_CACHE_WaWt 4ULL -#define GIC_BASER_CACHE_WaWb 5ULL -#define GIC_BASER_CACHE_RaWaWt 6ULL -#define GIC_BASER_CACHE_RaWaWb 7ULL -#define GIC_BASER_CACHE_MASK 7ULL -#define GIC_BASER_NonShareable 0ULL -#define GIC_BASER_InnerShareable 1ULL -#define GIC_BASER_OuterShareable 2ULL -#define GIC_BASER_SHAREABILITY_MASK 3ULL - -#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ - 
(GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) - -#define GIC_BASER_SHAREABILITY(reg, type) \ - (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) - -/* encode a size field of width @w containing @n - 1 units */ -#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) - -#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) -#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) -#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) -#define GICR_PROPBASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) -#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) -#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) -#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK - -#define GICR_PROPBASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) - -#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) -#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) -#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) -#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) -#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) -#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) -#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) -#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) - -#define GICR_PROPBASER_IDBITS_MASK (0x1f) -#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) -#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) - -#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) -#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) -#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) -#define GICR_PENDBASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) -#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) -#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) -#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK - -#define GICR_PENDBASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) - -#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) -#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) -#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) -#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) -#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) -#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) -#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) -#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) - -#define GICR_PENDBASER_PTZ BIT_ULL(62) - -/* - * Re-Distributor registers, offsets from SGI_base - */ -#define GICR_IGROUPR0 GICD_IGROUPR -#define GICR_ISENABLER0 GICD_ISENABLER -#define GICR_ICENABLER0 GICD_ICENABLER -#define GICR_ISPENDR0 GICD_ISPENDR -#define GICR_ICPENDR0 GICD_ICPENDR -#define GICR_ISACTIVER0 GICD_ISACTIVER -#define GICR_ICACTIVER0 GICD_ICACTIVER -#define GICR_IPRIORITYR0 GICD_IPRIORITYR -#define GICR_ICFGR0 GICD_ICFGR -#define 
GICR_IGRPMODR0 GICD_IGRPMODR -#define GICR_NSACR GICD_NSACR - -#define GICR_TYPER_PLPIS (1U << 0) -#define GICR_TYPER_VLPIS (1U << 1) -#define GICR_TYPER_DIRTY (1U << 2) -#define GICR_TYPER_DirectLPIS (1U << 3) -#define GICR_TYPER_LAST (1U << 4) -#define GICR_TYPER_RVPEID (1U << 7) -#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) -#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) - -#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) -#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) -#define GICR_INVLPIR_V GENMASK_ULL(63, 63) - -#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID -#define GICR_INVALLR_V GICR_INVLPIR_V - -#define GIC_V3_REDIST_SIZE 0x20000 - -#define LPI_PROP_GROUP1 (1 << 1) -#define LPI_PROP_ENABLED (1 << 0) - -/* - * Re-Distributor registers, offsets from VLPI_base - */ -#define GICR_VPROPBASER 0x0070 - -#define GICR_VPROPBASER_IDBITS_MASK 0x1f - -#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) -#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) -#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) - -#define GICR_VPROPBASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) -#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) -#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) -#define GICR_VPROPBASER_CACHEABILITY_MASK \ - GICR_VPROPBASER_INNER_CACHEABILITY_MASK - -#define GICR_VPROPBASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) - -#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) -#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) -#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) -#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) -#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) -#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) -#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) -#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) - -/* - * GICv4.1 VPROPBASER reinvention. A subtle mix between the old - * VPROPBASER and ITS_BASER. Just not quite any of the two. 
- */ -#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) -#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) -#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) -#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) -#define GICR_VPROPBASER_4_1_Z (1ULL << 52) -#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) -#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) - -#define GICR_VPENDBASER 0x0078 - -#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) -#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) -#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) -#define GICR_VPENDBASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) -#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) -#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) -#define GICR_VPENDBASER_CACHEABILITY_MASK \ - GICR_VPENDBASER_INNER_CACHEABILITY_MASK - -#define GICR_VPENDBASER_NonShareable \ - GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) - -#define GICR_VPENDBASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) - -#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) -#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) -#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) -#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) -#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) -#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) -#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) -#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) - -#define GICR_VPENDBASER_Dirty (1ULL << 60) -#define GICR_VPENDBASER_PendingLast (1ULL << 61) -#define GICR_VPENDBASER_IDAI (1ULL << 62) -#define GICR_VPENDBASER_Valid (1ULL << 63) - -/* - * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, - * also use the above Valid, PendingLast and Dirty. 
- */ -#define GICR_VPENDBASER_4_1_DB (1ULL << 62) -#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) -#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) -#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) - -#define GICR_VSGIR 0x0080 - -#define GICR_VSGIR_VPEID GENMASK(15, 0) - -#define GICR_VSGIPENDR 0x0088 - -#define GICR_VSGIPENDR_BUSY (1U << 31) -#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) - -/* - * ITS registers, offsets from ITS_base - */ -#define GITS_CTLR 0x0000 -#define GITS_IIDR 0x0004 -#define GITS_TYPER 0x0008 -#define GITS_MPIDR 0x0018 -#define GITS_CBASER 0x0080 -#define GITS_CWRITER 0x0088 -#define GITS_CREADR 0x0090 -#define GITS_BASER 0x0100 -#define GITS_IDREGS_BASE 0xffd0 -#define GITS_PIDR0 0xffe0 -#define GITS_PIDR1 0xffe4 -#define GITS_PIDR2 GICR_PIDR2 -#define GITS_PIDR4 0xffd0 -#define GITS_CIDR0 0xfff0 -#define GITS_CIDR1 0xfff4 -#define GITS_CIDR2 0xfff8 -#define GITS_CIDR3 0xfffc - -#define GITS_TRANSLATER 0x10040 - -#define GITS_SGIR 0x20020 - -#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) -#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) - -#define GITS_CTLR_ENABLE (1U << 0) -#define GITS_CTLR_ImDe (1U << 1) -#define GITS_CTLR_ITS_NUMBER_SHIFT 4 -#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) -#define GITS_CTLR_QUIESCENT (1U << 31) - -#define GITS_TYPER_PLPIS (1UL << 0) -#define GITS_TYPER_VLPIS (1UL << 1) -#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 -#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) -#define GITS_TYPER_IDBITS_SHIFT 8 -#define GITS_TYPER_DEVBITS_SHIFT 13 -#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) -#define GITS_TYPER_PTA (1UL << 19) -#define GITS_TYPER_HCC_SHIFT 24 -#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) -#define GITS_TYPER_VMOVP (1ULL << 37) -#define GITS_TYPER_VMAPP (1ULL << 40) -#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) - -#define GITS_IIDR_REV_SHIFT 12 -#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) -#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) -#define GITS_IIDR_PRODUCTID_SHIFT 24 - -#define GITS_CBASER_VALID (1ULL << 63) -#define GITS_CBASER_SHAREABILITY_SHIFT (10) -#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) -#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) -#define GITS_CBASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) -#define GITS_CBASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) -#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) -#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK - -#define GITS_CBASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) - -#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) -#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) -#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) -#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) -#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) -#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) -#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) -#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) - -#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) - -#define GITS_BASER_NR_REGS 8 - -#define GITS_BASER_VALID (1ULL << 63) -#define GITS_BASER_INDIRECT (1ULL << 62) - -#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) -#define 
GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) -#define GITS_BASER_INNER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) -#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK -#define GITS_BASER_OUTER_CACHEABILITY_MASK \ - GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) -#define GITS_BASER_SHAREABILITY_MASK \ - GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) - -#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) -#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) -#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) -#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) -#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) -#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) -#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) -#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) - -#define GITS_BASER_TYPE_SHIFT (56) -#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) -#define GITS_BASER_ENTRY_SIZE_SHIFT (48) -#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) -#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) -#define GITS_BASER_PHYS_52_to_48(phys) \ - (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) -#define GITS_BASER_ADDR_48_to_52(baser) \ - (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) - -#define GITS_BASER_SHAREABILITY_SHIFT (10) -#define GITS_BASER_InnerShareable \ - GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) -#define GITS_BASER_PAGE_SIZE_SHIFT (8) -#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) -#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) -#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) -#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) -#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) -#define GITS_BASER_PAGES_MAX 256 -#define GITS_BASER_PAGES_SHIFT (0) -#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) - -#define GITS_BASER_TYPE_NONE 0 -#define GITS_BASER_TYPE_DEVICE 1 -#define GITS_BASER_TYPE_VCPU 2 -#define GITS_BASER_TYPE_RESERVED3 3 -#define GITS_BASER_TYPE_COLLECTION 4 -#define GITS_BASER_TYPE_RESERVED5 5 -#define GITS_BASER_TYPE_RESERVED6 6 -#define GITS_BASER_TYPE_RESERVED7 7 - -#define GITS_LVL1_ENTRY_SIZE (8UL) - -/* - * ITS commands - */ -#define GITS_CMD_MAPD 0x08 -#define GITS_CMD_MAPC 0x09 -#define GITS_CMD_MAPTI 0x0a -#define GITS_CMD_MAPI 0x0b -#define GITS_CMD_MOVI 0x01 -#define GITS_CMD_DISCARD 0x0f -#define GITS_CMD_INV 0x0c -#define GITS_CMD_MOVALL 0x0e -#define GITS_CMD_INVALL 0x0d -#define GITS_CMD_INT 0x03 -#define GITS_CMD_CLEAR 0x04 -#define GITS_CMD_SYNC 0x05 - -/* - * GICv4 ITS specific commands - */ -#define GITS_CMD_GICv4(x) ((x) | 0x20) -#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) -#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) -#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) -#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) -#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) -/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ -#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) -#define GITS_CMD_VSGI GITS_CMD_GICv4(3) -#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) - -/* - * ITS error numbers - */ -#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 -#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 -#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 
-#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 -#define E_ITS_MAPD_DEVICE_OOR 0x010801 -#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 -#define E_ITS_MAPC_PROCNUM_OOR 0x010902 -#define E_ITS_MAPC_COLLECTION_OOR 0x010903 -#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 -#define E_ITS_MAPTI_ID_OOR 0x010a05 -#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 -#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 -#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 -#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 -#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 - -/* - * CPU interface registers - */ -#define ICC_CTLR_EL1_EOImode_SHIFT (1) -#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) -#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) -#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) -#define ICC_CTLR_EL1_CBPR_SHIFT 0 -#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) -#define ICC_CTLR_EL1_PMHE_SHIFT 6 -#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) -#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 -#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) -#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 -#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) -#define ICC_CTLR_EL1_SEIS_SHIFT 14 -#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) -#define ICC_CTLR_EL1_A3V_SHIFT 15 -#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) -#define ICC_CTLR_EL1_RSS (0x1 << 18) -#define ICC_CTLR_EL1_ExtRange (0x1 << 19) -#define ICC_PMR_EL1_SHIFT 0 -#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) -#define ICC_BPR0_EL1_SHIFT 0 -#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) -#define ICC_BPR1_EL1_SHIFT 0 -#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) -#define ICC_IGRPEN0_EL1_SHIFT 0 -#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) -#define ICC_IGRPEN1_EL1_SHIFT 0 -#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) -#define ICC_SRE_EL1_DIB (1U << 2) -#define ICC_SRE_EL1_DFB (1U << 1) -#define ICC_SRE_EL1_SRE (1U << 0) - -/* These are for GICv2 emulation only */ -#define GICH_LR_VIRTUALID (0x3ffUL << 0) -#define GICH_LR_PHYSID_CPUID_SHIFT (10) -#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) - -#define ICC_IAR1_EL1_SPURIOUS 0x3ff - -#define ICC_SRE_EL2_SRE (1 << 0) -#define ICC_SRE_EL2_ENABLE (1 << 3) - -#define ICC_SGI1R_TARGET_LIST_SHIFT 0 -#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) -#define ICC_SGI1R_AFFINITY_1_SHIFT 16 -#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) -#define ICC_SGI1R_SGI_ID_SHIFT 24 -#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) -#define ICC_SGI1R_AFFINITY_2_SHIFT 32 -#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) -#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 -#define ICC_SGI1R_RS_SHIFT 44 -#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) -#define ICC_SGI1R_AFFINITY_3_SHIFT 48 -#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) - -#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h deleted file mode 100644 index 3722ed9c8f96..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __SELFTESTS_GIC_V3_ITS_H__ -#define __SELFTESTS_GIC_V3_ITS_H__ - -#include - -void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, - vm_paddr_t 
device_tbl, size_t device_tbl_sz, - vm_paddr_t cmdq, size_t cmdq_size); - -void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, - size_t itt_size, bool valid); -void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); -void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, - u32 collection_id, u32 intid); -void its_send_invall_cmd(void *cmdq_base, u32 collection_id); - -#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h deleted file mode 100644 index e43a57d99b56..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UTIL_ARCH_H -#define SELFTEST_KVM_UTIL_ARCH_H - -struct kvm_vm_arch {}; - -#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h deleted file mode 100644 index 1e8d0d531fbd..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ /dev/null @@ -1,238 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * AArch64 processor specific defines - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#ifndef SELFTEST_KVM_PROCESSOR_H -#define SELFTEST_KVM_PROCESSOR_H - -#include "kvm_util.h" -#include "ucall_common.h" - -#include -#include -#include -#include -#include - - -#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ - KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) - -/* - * KVM_ARM64_SYS_REG(sys_reg_id): Helper macro to convert - * SYS_* register definitions in asm/sysreg.h to use in KVM - * calls such as vcpu_get_reg() and vcpu_set_reg(). - */ -#define KVM_ARM64_SYS_REG(sys_reg_id) \ - ARM64_SYS_REG(sys_reg_Op0(sys_reg_id), \ - sys_reg_Op1(sys_reg_id), \ - sys_reg_CRn(sys_reg_id), \ - sys_reg_CRm(sys_reg_id), \ - sys_reg_Op2(sys_reg_id)) - -/* - * Default MAIR - * index attribute - * DEVICE_nGnRnE 0 0000:0000 - * DEVICE_nGnRE 1 0000:0100 - * DEVICE_GRE 2 0000:1100 - * NORMAL_NC 3 0100:0100 - * NORMAL 4 1111:1111 - * NORMAL_WT 5 1011:1011 - */ - -/* Linux doesn't use these memory types, so let's define them. 
*/ -#define MAIR_ATTR_DEVICE_GRE UL(0x0c) -#define MAIR_ATTR_NORMAL_WT UL(0xbb) - -#define MT_DEVICE_nGnRnE 0 -#define MT_DEVICE_nGnRE 1 -#define MT_DEVICE_GRE 2 -#define MT_NORMAL_NC 3 -#define MT_NORMAL 4 -#define MT_NORMAL_WT 5 - -#define DEFAULT_MAIR_EL1 \ - (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ - MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ - MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) - -void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); -struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_vcpu_init *init, void *guest_code); - -struct ex_regs { - u64 regs[31]; - u64 sp; - u64 pc; - u64 pstate; -}; - -#define VECTOR_NUM 16 - -enum { - VECTOR_SYNC_CURRENT_SP0, - VECTOR_IRQ_CURRENT_SP0, - VECTOR_FIQ_CURRENT_SP0, - VECTOR_ERROR_CURRENT_SP0, - - VECTOR_SYNC_CURRENT, - VECTOR_IRQ_CURRENT, - VECTOR_FIQ_CURRENT, - VECTOR_ERROR_CURRENT, - - VECTOR_SYNC_LOWER_64, - VECTOR_IRQ_LOWER_64, - VECTOR_FIQ_LOWER_64, - VECTOR_ERROR_LOWER_64, - - VECTOR_SYNC_LOWER_32, - VECTOR_IRQ_LOWER_32, - VECTOR_FIQ_LOWER_32, - VECTOR_ERROR_LOWER_32, -}; - -#define VECTOR_IS_SYNC(v) ((v) == VECTOR_SYNC_CURRENT_SP0 || \ - (v) == VECTOR_SYNC_CURRENT || \ - (v) == VECTOR_SYNC_LOWER_64 || \ - (v) == VECTOR_SYNC_LOWER_32) - -/* Access flag */ -#define PTE_AF (1ULL << 10) - -/* Access flag update enable/disable */ -#define TCR_EL1_HA (1ULL << 39) - -void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, - uint32_t *ipa16k, uint32_t *ipa64k); - -void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); - -typedef void(*handler_fn)(struct ex_regs *); -void vm_install_exception_handler(struct kvm_vm *vm, - int vector, handler_fn handler); -void vm_install_sync_handler(struct kvm_vm *vm, - int vector, int ec, handler_fn handler); - -uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); - -static inline void cpu_relax(void) -{ - asm volatile("yield" ::: "memory"); -} - -#define isb() asm volatile("isb" : : : "memory") -#define dsb(opt) asm volatile("dsb " #opt : : : "memory") -#define dmb(opt) asm volatile("dmb " #opt : : : "memory") - -#define dma_wmb() dmb(oshst) -#define __iowmb() dma_wmb() - -#define dma_rmb() dmb(oshld) - -#define __iormb(v) \ -({ \ - unsigned long tmp; \ - \ - dma_rmb(); \ - \ - /* \ - * Courtesy of arch/arm64/include/asm/io.h: \ - * Create a dummy control dependency from the IO read to any \ - * later instructions. This ensures that a subsequent call \ - * to udelay() will be ordered due to the ISB in __delay(). \ - */ \ - asm volatile("eor %0, %1, %1\n" \ - "cbnz %0, ." 
\ - : "=r" (tmp) : "r" ((unsigned long)(v)) \ - : "memory"); \ -}) - -static __always_inline void __raw_writel(u32 val, volatile void *addr) -{ - asm volatile("str %w0, [%1]" : : "rZ" (val), "r" (addr)); -} - -static __always_inline u32 __raw_readl(const volatile void *addr) -{ - u32 val; - asm volatile("ldr %w0, [%1]" : "=r" (val) : "r" (addr)); - return val; -} - -static __always_inline void __raw_writeq(u64 val, volatile void *addr) -{ - asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); -} - -static __always_inline u64 __raw_readq(const volatile void *addr) -{ - u64 val; - asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); - return val; -} - -#define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) -#define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) -#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) -#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) - -#define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) -#define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) -#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) -#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) - - -static inline void local_irq_enable(void) -{ - asm volatile("msr daifclr, #3" : : : "memory"); -} - -static inline void local_irq_disable(void) -{ - asm volatile("msr daifset, #3" : : : "memory"); -} - -/** - * struct arm_smccc_res - Result from SMC/HVC call - * @a0-a3 result values from registers 0 to 3 - */ -struct arm_smccc_res { - unsigned long a0; - unsigned long a1; - unsigned long a2; - unsigned long a3; -}; - -/** - * smccc_hvc - Invoke a SMCCC function using the hvc conduit - * @function_id: the SMCCC function to be called - * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 - * @res: pointer to write the return values from registers x0-x3 - * - */ -void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, - uint64_t arg6, struct arm_smccc_res *res); - -/** - * smccc_smc - Invoke a SMCCC function using the smc conduit - * @function_id: the SMCCC function to be called - * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 - * @res: pointer to write the return values from registers x0-x3 - * - */ -void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, - uint64_t arg6, struct arm_smccc_res *res); - -/* Execute a Wait For Interrupt instruction. 
*/ -void wfi(void); - -#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/spinlock.h b/tools/testing/selftests/kvm/include/aarch64/spinlock.h deleted file mode 100644 index cf0984106d14..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/spinlock.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef SELFTEST_KVM_ARM64_SPINLOCK_H -#define SELFTEST_KVM_ARM64_SPINLOCK_H - -struct spinlock { - int v; -}; - -extern void spin_lock(struct spinlock *lock); -extern void spin_unlock(struct spinlock *lock); - -#endif /* SELFTEST_KVM_ARM64_SPINLOCK_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/ucall.h b/tools/testing/selftests/kvm/include/aarch64/ucall.h deleted file mode 100644 index 4ec801f37f00..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/ucall.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UCALL_H -#define SELFTEST_KVM_UCALL_H - -#include "kvm_util.h" - -#define UCALL_EXIT_REASON KVM_EXIT_MMIO - -/* - * ucall_exit_mmio_addr holds per-VM values (global data is duplicated by each - * VM), it must not be accessed from host code. - */ -extern vm_vaddr_t *ucall_exit_mmio_addr; - -static inline void ucall_arch_do_ucall(vm_vaddr_t uc) -{ - WRITE_ONCE(*ucall_exit_mmio_addr, uc); -} - -#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h deleted file mode 100644 index c481d0c00a5d..000000000000 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ARM Generic Interrupt Controller (GIC) host specific defines - */ - -#ifndef SELFTEST_KVM_VGIC_H -#define SELFTEST_KVM_VGIC_H - -#include - -#include "kvm_util.h" - -#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) \ - (((uint64_t)(count) << 52) | \ - ((uint64_t)((base) >> 16) << 16) | \ - ((uint64_t)(flags) << 12) | \ - index) - -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); - -#define VGIC_MAX_RESERVED 1023 - -void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); -int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); - -void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); -int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); - -/* The vcpu arg only applies to private interrupts. 
*/ -void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); -void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); - -#define KVM_IRQCHIP_NUM_PINS (1020 - 32) - -int vgic_its_setup(struct kvm_vm *vm); - -#endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/include/arm64/arch_timer.h b/tools/testing/selftests/kvm/include/arm64/arch_timer.h new file mode 100644 index 000000000000..bf461de34785 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/arch_timer.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Timer specific interface + */ + +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +enum arch_timer { + VIRTUAL, + PHYSICAL, +}; + +#define CTL_ENABLE (1 << 0) +#define CTL_IMASK (1 << 1) +#define CTL_ISTATUS (1 << 2) + +#define msec_to_cycles(msec) \ + (timer_get_cntfrq() * (uint64_t)(msec) / 1000) + +#define usec_to_cycles(usec) \ + (timer_get_cntfrq() * (uint64_t)(usec) / 1000000) + +#define cycles_to_usec(cycles) \ + ((uint64_t)(cycles) * 1000000 / timer_get_cntfrq()) + +static inline uint32_t timer_get_cntfrq(void) +{ + return read_sysreg(cntfrq_el0); +} + +static inline uint64_t timer_get_cntct(enum arch_timer timer) +{ + isb(); + + switch (timer) { + case VIRTUAL: + return read_sysreg(cntvct_el0); + case PHYSICAL: + return read_sysreg(cntpct_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_cval(enum arch_timer timer, uint64_t cval) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(cval, cntv_cval_el0); + break; + case PHYSICAL: + write_sysreg(cval, cntp_cval_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline uint64_t timer_get_cval(enum arch_timer timer) +{ + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_cval_el0); + case PHYSICAL: + return read_sysreg(cntp_cval_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_tval(enum arch_timer timer, int32_t tval) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(tval, cntv_tval_el0); + break; + case PHYSICAL: + write_sysreg(tval, cntp_tval_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline int32_t timer_get_tval(enum arch_timer timer) +{ + isb(); + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_tval_el0); + case PHYSICAL: + return read_sysreg(cntp_tval_el0); + default: + GUEST_FAIL("Could not get timer %d\n", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_ctl(enum arch_timer timer, uint32_t ctl) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(ctl, cntv_ctl_el0); + break; + case PHYSICAL: + write_sysreg(ctl, cntp_ctl_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline uint32_t timer_get_ctl(enum arch_timer timer) +{ + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_ctl_el0); + case PHYSICAL: + return read_sysreg(cntp_ctl_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_next_cval_ms(enum arch_timer timer, uint32_t msec) +{ + uint64_t now_ct = timer_get_cntct(timer); + uint64_t next_ct = now_ct + msec_to_cycles(msec); 
+ + timer_set_cval(timer, next_ct); +} + +static inline void timer_set_next_tval_ms(enum arch_timer timer, uint32_t msec) +{ + timer_set_tval(timer, msec_to_cycles(msec)); +} + +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/arm64/delay.h b/tools/testing/selftests/kvm/include/arm64/delay.h new file mode 100644 index 000000000000..329e4f5079ea --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/delay.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM simple delay routines + */ + +#ifndef SELFTEST_KVM_ARM_DELAY_H +#define SELFTEST_KVM_ARM_DELAY_H + +#include "arch_timer.h" + +static inline void __delay(uint64_t cycles) +{ + enum arch_timer timer = VIRTUAL; + uint64_t start = timer_get_cntct(timer); + + while ((timer_get_cntct(timer) - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} + +#endif /* SELFTEST_KVM_ARM_DELAY_H */ diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h new file mode 100644 index 000000000000..baeb3c859389 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) specific defines + */ + +#ifndef SELFTEST_KVM_GIC_H +#define SELFTEST_KVM_GIC_H + +#include + +enum gic_type { + GIC_V3, + GIC_TYPE_MAX, +}; + +/* + * Note that the redistributor frames are at the end, as the range scales + * with the number of vCPUs in the VM. + */ +#define GITS_BASE_GPA 0x8000000ULL +#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) +#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) + +/* The GIC is identity-mapped into the guest at the time of setup. */ +#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) +#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) +#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) + +#define MIN_SGI 0 +#define MIN_PPI 16 +#define MIN_SPI 32 +#define MAX_SPI 1019 +#define IAR_SPURIOUS 1023 + +#define INTID_IS_SGI(intid) (0 <= (intid) && (intid) < MIN_PPI) +#define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) +#define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) + +void gic_init(enum gic_type type, unsigned int nr_cpus); +void gic_irq_enable(unsigned int intid); +void gic_irq_disable(unsigned int intid); +unsigned int gic_get_and_ack_irq(void); +void gic_set_eoi(unsigned int intid); +void gic_set_dir(unsigned int intid); + +/* + * Sets the EOI mode. When split is false, EOI just drops the priority. When + * split is true, EOI drops the priority and deactivates the interrupt. 
+ */ +void gic_set_eoi_split(bool split); +void gic_set_priority_mask(uint64_t mask); +void gic_set_priority(uint32_t intid, uint32_t prio); +void gic_irq_set_active(unsigned int intid); +void gic_irq_clear_active(unsigned int intid); +bool gic_irq_get_active(unsigned int intid); +void gic_irq_set_pending(unsigned int intid); +void gic_irq_clear_pending(unsigned int intid); +bool gic_irq_get_pending(unsigned int intid); +void gic_irq_set_config(unsigned int intid, bool is_edge); + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table); + +#endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3.h b/tools/testing/selftests/kvm/include/arm64/gic_v3.h new file mode 100644 index 000000000000..a76615fa39a1 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3.h @@ -0,0 +1,604 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier + */ +#ifndef __SELFTESTS_GIC_V3_H +#define __SELFTESTS_GIC_V3_H + +/* + * Distributor registers. We assume we're running non-secure, with ARE + * being set. Secure-only and non-ARE registers are not described. + */ +#define GICD_CTLR 0x0000 +#define GICD_TYPER 0x0004 +#define GICD_IIDR 0x0008 +#define GICD_TYPER2 0x000C +#define GICD_STATUSR 0x0010 +#define GICD_SETSPI_NSR 0x0040 +#define GICD_CLRSPI_NSR 0x0048 +#define GICD_SETSPI_SR 0x0050 +#define GICD_CLRSPI_SR 0x0058 +#define GICD_IGROUPR 0x0080 +#define GICD_ISENABLER 0x0100 +#define GICD_ICENABLER 0x0180 +#define GICD_ISPENDR 0x0200 +#define GICD_ICPENDR 0x0280 +#define GICD_ISACTIVER 0x0300 +#define GICD_ICACTIVER 0x0380 +#define GICD_IPRIORITYR 0x0400 +#define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 +#define GICD_IGROUPRnE 0x1000 +#define GICD_ISENABLERnE 0x1200 +#define GICD_ICENABLERnE 0x1400 +#define GICD_ISPENDRnE 0x1600 +#define GICD_ICPENDRnE 0x1800 +#define GICD_ISACTIVERnE 0x1A00 +#define GICD_ICACTIVERnE 0x1C00 +#define GICD_IPRIORITYRnE 0x2000 +#define GICD_ICFGRnE 0x3000 +#define GICD_IROUTER 0x6000 +#define GICD_IROUTERnE 0x8000 +#define GICD_IDREGS 0xFFD0 +#define GICD_PIDR2 0xFFE8 + +#define ESPI_BASE_INTID 4096 + +/* + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). 
+ */ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + +#define GICD_CTLR_RWP (1U << 31) +#define GICD_CTLR_nASSGIreq (1U << 8) +#define GICD_CTLR_DS (1U << 6) +#define GICD_CTLR_ARE_NS (1U << 4) +#define GICD_CTLR_ENABLE_G1A (1U << 1) +#define GICD_CTLR_ENABLE_G1 (1U << 0) + +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_RSS (1U << 26) +#define GICD_TYPER_LPIS (1U << 17) +#define GICD_TYPER_MBIS (1U << 16) +#define GICD_TYPER_ESPI (1U << 8) + +#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) +#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) +#define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) +#define GICD_TYPER_ESPIS(typer) \ + (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) + +#define GICD_TYPER2_nASSGIcap (1U << 8) +#define GICD_TYPER2_VIL (1U << 7) +#define GICD_TYPER2_VID GENMASK(4, 0) + +#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) +#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) + +#define GIC_PIDR2_ARCH_MASK 0xf0 +#define GIC_PIDR2_ARCH_GICv3 0x30 +#define GIC_PIDR2_ARCH_GICv4 0x40 + +#define GIC_V3_DIST_SIZE 0x10000 + +#define GIC_PAGE_SIZE_4K 0ULL +#define GIC_PAGE_SIZE_16K 1ULL +#define GIC_PAGE_SIZE_64K 2ULL +#define GIC_PAGE_SIZE_MASK 3ULL + +/* + * Re-Distributor registers, offsets from RD_base + */ +#define GICR_CTLR GICD_CTLR +#define GICR_IIDR 0x0004 +#define GICR_TYPER 0x0008 +#define GICR_STATUSR GICD_STATUSR +#define GICR_WAKER 0x0014 +#define GICR_SETLPIR 0x0040 +#define GICR_CLRLPIR 0x0048 +#define GICR_PROPBASER 0x0070 +#define GICR_PENDBASER 0x0078 +#define GICR_INVLPIR 0x00A0 +#define GICR_INVALLR 0x00B0 +#define GICR_SYNCR 0x00C0 +#define GICR_IDREGS GICD_IDREGS +#define GICR_PIDR2 GICD_PIDR2 + +#define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) +#define GICR_CTLR_RWP (1UL << 3) + +#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) + +#define EPPI_BASE_INTID 1056 + +#define GICR_TYPER_NR_PPIS(r) \ + ({ \ + unsigned int __ppinum = ((r) >> 27) & 0x1f; \ + unsigned int __nr_ppis = 16; \ + if (__ppinum == 1 || __ppinum == 2) \ + __nr_ppis += __ppinum * 32; \ + \ + __nr_ppis; \ + }) + +#define GICR_WAKER_ProcessorSleep (1U << 1) +#define GICR_WAKER_ChildrenAsleep (1U << 2) + +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + 
(GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) + +/* + * Re-Distributor registers, offsets from SGI_base + */ +#define GICR_IGROUPR0 GICD_IGROUPR +#define GICR_ISENABLER0 GICD_ISENABLER +#define GICR_ICENABLER0 GICD_ICENABLER +#define GICR_ISPENDR0 GICD_ISPENDR +#define GICR_ICPENDR0 GICD_ICPENDR +#define GICR_ISACTIVER0 GICD_ISACTIVER +#define GICR_ICACTIVER0 GICD_ICACTIVER +#define GICR_IPRIORITYR0 GICD_IPRIORITYR +#define GICR_ICFGR0 GICD_ICFGR +#define 
GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR + +#define GICR_TYPER_PLPIS (1U << 0) +#define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DIRTY (1U << 2) +#define GICR_TYPER_DirectLPIS (1U << 3) +#define GICR_TYPER_LAST (1U << 4) +#define GICR_TYPER_RVPEID (1U << 7) +#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) +#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) + +#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) +#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) +#define GICR_INVLPIR_V GENMASK_ULL(63, 63) + +#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID +#define GICR_INVALLR_V GICR_INVLPIR_V + +#define GIC_V3_REDIST_SIZE 0x20000 + +#define LPI_PROP_GROUP1 (1 << 1) +#define LPI_PROP_ENABLED (1 << 0) + +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +/* + * GICv4.1 VPROPBASER reinvention. A subtle mix between the old + * VPROPBASER and ITS_BASER. Just not quite any of the two. 
+ */ +#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) +#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) +#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) +#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) +#define GICR_VPROPBASER_4_1_Z (1ULL << 52) +#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) +#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + +/* + * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, + * also use the above Valid, PendingLast and Dirty. 
+ */ +#define GICR_VPENDBASER_4_1_DB (1ULL << 62) +#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) +#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) +#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) + +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + +/* + * ITS registers, offsets from ITS_base + */ +#define GITS_CTLR 0x0000 +#define GITS_IIDR 0x0004 +#define GITS_TYPER 0x0008 +#define GITS_MPIDR 0x0018 +#define GITS_CBASER 0x0080 +#define GITS_CWRITER 0x0088 +#define GITS_CREADR 0x0090 +#define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 +#define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc + +#define GITS_TRANSLATER 0x10040 + +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) +#define GITS_CTLR_QUIESCENT (1U << 31) + +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS_SHIFT 8 +#define GITS_TYPER_DEVBITS_SHIFT 13 +#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) +#define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HCC_SHIFT 24 +#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) +#define GITS_TYPER_VMOVP (1ULL << 37) +#define GITS_TYPER_VMAPP (1ULL << 40) +#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) + +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + +#define GITS_CBASER_VALID (1ULL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK + +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) + +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + +#define GITS_BASER_NR_REGS 8 + +#define GITS_BASER_VALID (1ULL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define 
GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + +#define GITS_BASER_SHAREABILITY_SHIFT (10) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +#define GITS_BASER_PAGE_SIZE_SHIFT (8) +#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) +#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) +#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) +#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) +#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) +#define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) + +#define GITS_BASER_TYPE_NONE 0 +#define GITS_BASER_TYPE_DEVICE 1 +#define GITS_BASER_TYPE_VCPU 2 +#define GITS_BASER_TYPE_RESERVED3 3 +#define GITS_BASER_TYPE_COLLECTION 4 +#define GITS_BASER_TYPE_RESERVED5 5 +#define GITS_BASER_TYPE_RESERVED6 6 +#define GITS_BASER_TYPE_RESERVED7 7 + +#define GITS_LVL1_ENTRY_SIZE (8UL) + +/* + * ITS commands + */ +#define GITS_CMD_MAPD 0x08 +#define GITS_CMD_MAPC 0x09 +#define GITS_CMD_MAPTI 0x0a +#define GITS_CMD_MAPI 0x0b +#define GITS_CMD_MOVI 0x01 +#define GITS_CMD_DISCARD 0x0f +#define GITS_CMD_INV 0x0c +#define GITS_CMD_MOVALL 0x0e +#define GITS_CMD_INVALL 0x0d +#define GITS_CMD_INT 0x03 +#define GITS_CMD_CLEAR 0x04 +#define GITS_CMD_SYNC 0x05 + +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) +#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) + +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + +/* + * CPU interface registers + */ +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) +#define ICC_CTLR_EL1_ExtRange (0x1 << 19) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) +#define ICC_SRE_EL1_SRE (1U << 0) + +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +#define ICC_IAR1_EL1_SPURIOUS 0x3ff + +#define ICC_SRE_EL2_SRE (1 << 0) +#define ICC_SRE_EL2_ENABLE (1 << 3) + +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) + +#endif diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h new file mode 100644 index 000000000000..3722ed9c8f96 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTESTS_GIC_V3_ITS_H__ +#define __SELFTESTS_GIC_V3_ITS_H__ + +#include + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, 
size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size); + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid); +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid); +void its_send_invall_cmd(void *cmdq_base, u32 collection_id); + +#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h new file mode 100644 index 000000000000..e43a57d99b56 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +struct kvm_vm_arch {}; + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h new file mode 100644 index 000000000000..1e8d0d531fbd --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch64 processor specific defines + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include "kvm_util.h" +#include "ucall_common.h" + +#include +#include +#include +#include +#include + + +#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) + +/* + * KVM_ARM64_SYS_REG(sys_reg_id): Helper macro to convert + * SYS_* register definitions in asm/sysreg.h to use in KVM + * calls such as vcpu_get_reg() and vcpu_set_reg(). + */ +#define KVM_ARM64_SYS_REG(sys_reg_id) \ + ARM64_SYS_REG(sys_reg_Op0(sys_reg_id), \ + sys_reg_Op1(sys_reg_id), \ + sys_reg_CRn(sys_reg_id), \ + sys_reg_CRm(sys_reg_id), \ + sys_reg_Op2(sys_reg_id)) + +/* + * Default MAIR + * index attribute + * DEVICE_nGnRnE 0 0000:0000 + * DEVICE_nGnRE 1 0000:0100 + * DEVICE_GRE 2 0000:1100 + * NORMAL_NC 3 0100:0100 + * NORMAL 4 1111:1111 + * NORMAL_WT 5 1011:1011 + */ + +/* Linux doesn't use these memory types, so let's define them. 
*/ +#define MAIR_ATTR_DEVICE_GRE UL(0x0c) +#define MAIR_ATTR_NORMAL_WT UL(0xbb) + +#define MT_DEVICE_nGnRnE 0 +#define MT_DEVICE_nGnRE 1 +#define MT_DEVICE_GRE 2 +#define MT_NORMAL_NC 3 +#define MT_NORMAL 4 +#define MT_NORMAL_WT 5 + +#define DEFAULT_MAIR_EL1 \ + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) + +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code); + +struct ex_regs { + u64 regs[31]; + u64 sp; + u64 pc; + u64 pstate; +}; + +#define VECTOR_NUM 16 + +enum { + VECTOR_SYNC_CURRENT_SP0, + VECTOR_IRQ_CURRENT_SP0, + VECTOR_FIQ_CURRENT_SP0, + VECTOR_ERROR_CURRENT_SP0, + + VECTOR_SYNC_CURRENT, + VECTOR_IRQ_CURRENT, + VECTOR_FIQ_CURRENT, + VECTOR_ERROR_CURRENT, + + VECTOR_SYNC_LOWER_64, + VECTOR_IRQ_LOWER_64, + VECTOR_FIQ_LOWER_64, + VECTOR_ERROR_LOWER_64, + + VECTOR_SYNC_LOWER_32, + VECTOR_IRQ_LOWER_32, + VECTOR_FIQ_LOWER_32, + VECTOR_ERROR_LOWER_32, +}; + +#define VECTOR_IS_SYNC(v) ((v) == VECTOR_SYNC_CURRENT_SP0 || \ + (v) == VECTOR_SYNC_CURRENT || \ + (v) == VECTOR_SYNC_LOWER_64 || \ + (v) == VECTOR_SYNC_LOWER_32) + +/* Access flag */ +#define PTE_AF (1ULL << 10) + +/* Access flag update enable/disable */ +#define TCR_EL1_HA (1ULL << 39) + +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k); + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); + +typedef void(*handler_fn)(struct ex_regs *); +void vm_install_exception_handler(struct kvm_vm *vm, + int vector, handler_fn handler); +void vm_install_sync_handler(struct kvm_vm *vm, + int vector, int ec, handler_fn handler); + +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline void cpu_relax(void) +{ + asm volatile("yield" ::: "memory"); +} + +#define isb() asm volatile("isb" : : : "memory") +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") + +#define dma_wmb() dmb(oshst) +#define __iowmb() dma_wmb() + +#define dma_rmb() dmb(oshld) + +#define __iormb(v) \ +({ \ + unsigned long tmp; \ + \ + dma_rmb(); \ + \ + /* \ + * Courtesy of arch/arm64/include/asm/io.h: \ + * Create a dummy control dependency from the IO read to any \ + * later instructions. This ensures that a subsequent call \ + * to udelay() will be ordered due to the ISB in __delay(). \ + */ \ + asm volatile("eor %0, %1, %1\n" \ + "cbnz %0, ." 
\ + : "=r" (tmp) : "r" ((unsigned long)(v)) \ + : "memory"); \ +}) + +static __always_inline void __raw_writel(u32 val, volatile void *addr) +{ + asm volatile("str %w0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u32 __raw_readl(const volatile void *addr) +{ + u32 val; + asm volatile("ldr %w0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + +static __always_inline void __raw_writeq(u64 val, volatile void *addr) +{ + asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u64 __raw_readq(const volatile void *addr) +{ + u64 val; + asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + +#define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) +#define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) +#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) +#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) + +#define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) +#define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) +#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) +#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) + + +static inline void local_irq_enable(void) +{ + asm volatile("msr daifclr, #3" : : : "memory"); +} + +static inline void local_irq_disable(void) +{ + asm volatile("msr daifset, #3" : : : "memory"); +} + +/** + * struct arm_smccc_res - Result from SMC/HVC call + * @a0-a3 result values from registers 0 to 3 + */ +struct arm_smccc_res { + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; +}; + +/** + * smccc_hvc - Invoke a SMCCC function using the hvc conduit + * @function_id: the SMCCC function to be called + * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 + * @res: pointer to write the return values from registers x0-x3 + * + */ +void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res); + +/** + * smccc_smc - Invoke a SMCCC function using the smc conduit + * @function_id: the SMCCC function to be called + * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 + * @res: pointer to write the return values from registers x0-x3 + * + */ +void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res); + +/* Execute a Wait For Interrupt instruction. 
*/ +void wfi(void); + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/arm64/spinlock.h b/tools/testing/selftests/kvm/include/arm64/spinlock.h new file mode 100644 index 000000000000..cf0984106d14 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/spinlock.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef SELFTEST_KVM_ARM64_SPINLOCK_H +#define SELFTEST_KVM_ARM64_SPINLOCK_H + +struct spinlock { + int v; +}; + +extern void spin_lock(struct spinlock *lock); +extern void spin_unlock(struct spinlock *lock); + +#endif /* SELFTEST_KVM_ARM64_SPINLOCK_H */ diff --git a/tools/testing/selftests/kvm/include/arm64/ucall.h b/tools/testing/selftests/kvm/include/arm64/ucall.h new file mode 100644 index 000000000000..4ec801f37f00 --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/ucall.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_MMIO + +/* + * ucall_exit_mmio_addr holds per-VM values (global data is duplicated by each + * VM), it must not be accessed from host code. + */ +extern vm_vaddr_t *ucall_exit_mmio_addr; + +static inline void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + WRITE_ONCE(*ucall_exit_mmio_addr, uc); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/arm64/vgic.h b/tools/testing/selftests/kvm/include/arm64/vgic.h new file mode 100644 index 000000000000..c481d0c00a5d --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/vgic.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) host specific defines + */ + +#ifndef SELFTEST_KVM_VGIC_H +#define SELFTEST_KVM_VGIC_H + +#include + +#include "kvm_util.h" + +#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) \ + (((uint64_t)(count) << 52) | \ + ((uint64_t)((base) >> 16) << 16) | \ + ((uint64_t)(flags) << 12) | \ + index) + +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); + +#define VGIC_MAX_RESERVED 1023 + +void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); +int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); + +void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); +int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); + +/* The vcpu arg only applies to private interrupts. */ +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); + +#define KVM_IRQCHIP_NUM_PINS (1020 - 32) + +int vgic_its_setup(struct kvm_vm *vm); + +#endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/include/s390/debug_print.h b/tools/testing/selftests/kvm/include/s390/debug_print.h new file mode 100644 index 000000000000..1bf275631cc6 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/debug_print.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Definition for kernel virtual machines on s390x + * + * Copyright IBM Corp. 
2024 + * + * Authors: + * Christoph Schlameuss + */ + +#ifndef SELFTEST_KVM_DEBUG_PRINT_H +#define SELFTEST_KVM_DEBUG_PRINT_H + +#include "asm/ptrace.h" +#include "kvm_util.h" +#include "sie.h" + +static inline void print_hex_bytes(const char *name, u64 addr, size_t len) +{ + u64 pos; + + pr_debug("%s (%p)\n", name, (void *)addr); + pr_debug(" 0/0x00---------|"); + if (len > 8) + pr_debug(" 8/0x08---------|"); + if (len > 16) + pr_debug(" 16/0x10--------|"); + if (len > 24) + pr_debug(" 24/0x18--------|"); + for (pos = 0; pos < len; pos += 8) { + if ((pos % 32) == 0) + pr_debug("\n %3lu 0x%.3lx ", pos, pos); + pr_debug(" %16lx", *((u64 *)(addr + pos))); + } + pr_debug("\n"); +} + +static inline void print_hex(const char *name, u64 addr) +{ + print_hex_bytes(name, addr, 512); +} + +static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", + run->flags, + run->psw_mask, run->psw_addr, + run->exit_reason, exit_reason_str(run->exit_reason)); + pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", + sie_block->psw_mask, sie_block->psw_addr); +} + +static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + print_hex_bytes("run", (u64)run, 0x150); + print_hex("sie_block", (u64)sie_block); + print_psw(run, sie_block); +} + +static inline void print_regs(struct kvm_run *run) +{ + struct kvm_sync_regs *sync_regs = &run->s.regs; + + print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); + print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); + print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); +} + +#endif /* SELFTEST_KVM_DEBUG_PRINT_H */ diff --git a/tools/testing/selftests/kvm/include/s390/diag318_test_handler.h b/tools/testing/selftests/kvm/include/s390/diag318_test_handler.h new file mode 100644 index 000000000000..b0ed71302722 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/diag318_test_handler.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Test handler for the s390x DIAGNOSE 0x0318 instruction. + * + * Copyright (C) 2020, IBM + */ + +#ifndef SELFTEST_KVM_DIAG318_TEST_HANDLER +#define SELFTEST_KVM_DIAG318_TEST_HANDLER + +uint64_t get_diag318_info(void); + +#endif diff --git a/tools/testing/selftests/kvm/include/s390/facility.h b/tools/testing/selftests/kvm/include/s390/facility.h new file mode 100644 index 000000000000..00a1ced6538b --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/facility.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright IBM Corp. 
2024 + * + * Authors: + * Hariharan Mari + * + * Get the facility bits with the STFLE instruction + */ + +#ifndef SELFTEST_KVM_FACILITY_H +#define SELFTEST_KVM_FACILITY_H + +#include + +/* alt_stfle_fac_list[16] + stfle_fac_list[16] */ +#define NB_STFL_DOUBLEWORDS 32 + +extern uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS]; +extern bool stfle_flag; + +static inline bool test_bit_inv(unsigned long nr, const unsigned long *ptr) +{ + return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline void stfle(uint64_t *fac, unsigned int nb_doublewords) +{ + register unsigned long r0 asm("0") = nb_doublewords - 1; + + asm volatile(" .insn s,0xb2b00000,0(%1)\n" + : "+d" (r0) + : "a" (fac) + : "memory", "cc"); +} + +static inline void setup_facilities(void) +{ + stfle(stfl_doublewords, NB_STFL_DOUBLEWORDS); + stfle_flag = true; +} + +static inline bool test_facility(int nr) +{ + if (!stfle_flag) + setup_facilities(); + return test_bit_inv(nr, stfl_doublewords); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h new file mode 100644 index 000000000000..e43a57d99b56 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +struct kvm_vm_arch {}; + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390/processor.h b/tools/testing/selftests/kvm/include/s390/processor.h new file mode 100644 index 000000000000..33fef6fd9617 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/processor.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * s390x processor specific defines + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include + +/* Bits in the region/segment table entry */ +#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */ +#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */ +#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ +#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */ +#define REGION_ENTRY_LENGTH 0x03 /* region third length */ + +/* Bits in the page table entry */ +#define PAGE_INVALID 0x400 /* HW invalid bit */ +#define PAGE_PROTECT 0x200 /* HW read-only bit */ +#define PAGE_NOEXEC 0x100 /* HW no-execute bit */ + +/* Page size definitions */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE BIT_ULL(PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +/* Is there a portable way to do this? */ +static inline void cpu_relax(void) +{ + barrier(); +} + +/* Get the instruction length */ +static inline int insn_length(unsigned char code) +{ + return ((((int)code + 64) >> 7) + 1) << 1; +} + +#endif diff --git a/tools/testing/selftests/kvm/include/s390/sie.h b/tools/testing/selftests/kvm/include/s390/sie.h new file mode 100644 index 000000000000..160acd4a1db9 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/sie.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Definition for kernel virtual machines on s390. + * + * Adapted copy of struct definition kvm_s390_sie_block from + * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. + * + * Copyright IBM Corp. 
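test_bit_inv() above bridges two bit-numbering conventions: STFLE reports facility bits MSB-first within each doubleword, while test_bit() counts from the LSB, hence the XOR with BITS_PER_LONG - 1. A small stand-alone sketch of that conversion, assuming 64-bit longs, with an invented facility list and a local re-implementation of test_bit() instead of the kernel helper:

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG 64

static bool test_bit(unsigned long nr, const unsigned long *ptr)
{
	return (ptr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static bool test_bit_inv(unsigned long nr, const unsigned long *ptr)
{
	return test_bit(nr ^ (BITS_PER_LONG - 1), ptr);
}

int main(void)
{
	/* Fake facility list: only "facility 129" set, i.e. the second-most
	 * significant bit of doubleword 2 (invented data, not real STFLE output). */
	unsigned long fac[4] = { 0 };

	fac[2] = 1UL << 62;

	printf("facility 129: %d\n", test_bit_inv(129, fac));	/* 1 */
	printf("facility 128: %d\n", test_bit_inv(128, fac));	/* 0 */
	return 0;
}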
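insn_length() above recovers the s390 instruction length from the two most significant bits of the first opcode byte (00 -> 2 bytes, 01 or 10 -> 4 bytes, 11 -> 6 bytes). A short worked example of the arithmetic, with opcode bytes picked purely for illustration:

#include <stdio.h>

static int insn_length(unsigned char code)
{
	return ((((int)code + 64) >> 7) + 1) << 1;
}

int main(void)
{
	printf("0x07 -> %d bytes\n", insn_length(0x07));	/* top bits 00: 2 */
	printf("0xb2 -> %d bytes\n", insn_length(0xb2));	/* top bits 10: 4 */
	printf("0xe3 -> %d bytes\n", insn_length(0xe3));	/* top bits 11: 6 */
	return 0;
}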
2008, 2024 + * + * Authors: + * Christoph Schlameuss + * Carsten Otte + */ + +#ifndef SELFTEST_KVM_SIE_H +#define SELFTEST_KVM_SIE_H + +#include + +struct kvm_s390_sie_block { +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + __u32 cpuflags; /* 0x0000 */ + __u32: 1; /* 0x0004 */ + __u32 prefix : 18; + __u32: 1; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE BIT(0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE BIT(0) +#define PROG_REQUEST BIT(1) + __u32 prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 0x2C +#define ICPT_PARTEXEC 0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 
0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + __u64 psw_mask; /* 0x0090 */ + __u64 psw_addr; /* 0x0098 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +#endif /* SELFTEST_KVM_SIE_H */ diff --git a/tools/testing/selftests/kvm/include/s390/ucall.h b/tools/testing/selftests/kvm/include/s390/ucall.h new file mode 100644 index 000000000000..8035a872a351 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390/ucall.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC + +static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ +} + +static inline void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ + asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory"); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/s390x/debug_print.h b/tools/testing/selftests/kvm/include/s390x/debug_print.h deleted file mode 100644 index 1bf275631cc6..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/debug_print.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0-only */ -/* - * Definition for kernel virtual machines on s390x - * - * Copyright IBM Corp. 2024 - * - * Authors: - * Christoph Schlameuss - */ - -#ifndef SELFTEST_KVM_DEBUG_PRINT_H -#define SELFTEST_KVM_DEBUG_PRINT_H - -#include "asm/ptrace.h" -#include "kvm_util.h" -#include "sie.h" - -static inline void print_hex_bytes(const char *name, u64 addr, size_t len) -{ - u64 pos; - - pr_debug("%s (%p)\n", name, (void *)addr); - pr_debug(" 0/0x00---------|"); - if (len > 8) - pr_debug(" 8/0x08---------|"); - if (len > 16) - pr_debug(" 16/0x10--------|"); - if (len > 24) - pr_debug(" 24/0x18--------|"); - for (pos = 0; pos < len; pos += 8) { - if ((pos % 32) == 0) - pr_debug("\n %3lu 0x%.3lx ", pos, pos); - pr_debug(" %16lx", *((u64 *)(addr + pos))); - } - pr_debug("\n"); -} - -static inline void print_hex(const char *name, u64 addr) -{ - print_hex_bytes(name, addr, 512); -} - -static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) -{ - pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", - run->flags, - run->psw_mask, run->psw_addr, - run->exit_reason, exit_reason_str(run->exit_reason)); - pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", - sie_block->psw_mask, sie_block->psw_addr); -} - -static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) -{ - print_hex_bytes("run", (u64)run, 0x150); - print_hex("sie_block", (u64)sie_block); - print_psw(run, sie_block); -} - -static inline void print_regs(struct kvm_run *run) -{ - struct kvm_sync_regs *sync_regs = &run->s.regs; - - print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); - print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); - print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); -} - -#endif /* SELFTEST_KVM_DEBUG_PRINT_H */ diff --git a/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h b/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h deleted file mode 100644 index b0ed71302722..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later - * - * Test handler for the s390x DIAGNOSE 0x0318 instruction. - * - * Copyright (C) 2020, IBM - */ - -#ifndef SELFTEST_KVM_DIAG318_TEST_HANDLER -#define SELFTEST_KVM_DIAG318_TEST_HANDLER - -uint64_t get_diag318_info(void); - -#endif diff --git a/tools/testing/selftests/kvm/include/s390x/facility.h b/tools/testing/selftests/kvm/include/s390x/facility.h deleted file mode 100644 index 00a1ced6538b..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/facility.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright IBM Corp. 
2024 - * - * Authors: - * Hariharan Mari - * - * Get the facility bits with the STFLE instruction - */ - -#ifndef SELFTEST_KVM_FACILITY_H -#define SELFTEST_KVM_FACILITY_H - -#include - -/* alt_stfle_fac_list[16] + stfle_fac_list[16] */ -#define NB_STFL_DOUBLEWORDS 32 - -extern uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS]; -extern bool stfle_flag; - -static inline bool test_bit_inv(unsigned long nr, const unsigned long *ptr) -{ - return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); -} - -static inline void stfle(uint64_t *fac, unsigned int nb_doublewords) -{ - register unsigned long r0 asm("0") = nb_doublewords - 1; - - asm volatile(" .insn s,0xb2b00000,0(%1)\n" - : "+d" (r0) - : "a" (fac) - : "memory", "cc"); -} - -static inline void setup_facilities(void) -{ - stfle(stfl_doublewords, NB_STFL_DOUBLEWORDS); - stfle_flag = true; -} - -static inline bool test_facility(int nr) -{ - if (!stfle_flag) - setup_facilities(); - return test_bit_inv(nr, stfl_doublewords); -} - -#endif diff --git a/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h deleted file mode 100644 index e43a57d99b56..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UTIL_ARCH_H -#define SELFTEST_KVM_UTIL_ARCH_H - -struct kvm_vm_arch {}; - -#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h deleted file mode 100644 index 33fef6fd9617..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/processor.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * s390x processor specific defines - */ -#ifndef SELFTEST_KVM_PROCESSOR_H -#define SELFTEST_KVM_PROCESSOR_H - -#include - -/* Bits in the region/segment table entry */ -#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */ -#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */ -#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ -#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */ -#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ -#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */ -#define REGION_ENTRY_LENGTH 0x03 /* region third length */ - -/* Bits in the page table entry */ -#define PAGE_INVALID 0x400 /* HW invalid bit */ -#define PAGE_PROTECT 0x200 /* HW read-only bit */ -#define PAGE_NOEXEC 0x100 /* HW no-execute bit */ - -/* Page size definitions */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE BIT_ULL(PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE - 1)) - -/* Is there a portable way to do this? */ -static inline void cpu_relax(void) -{ - barrier(); -} - -/* Get the instruction length */ -static inline int insn_length(unsigned char code) -{ - return ((((int)code + 64) >> 7) + 1) << 1; -} - -#endif diff --git a/tools/testing/selftests/kvm/include/s390x/sie.h b/tools/testing/selftests/kvm/include/s390x/sie.h deleted file mode 100644 index 160acd4a1db9..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/sie.h +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Definition for kernel virtual machines on s390. - * - * Adapted copy of struct definition kvm_s390_sie_block from - * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. - * - * Copyright IBM Corp. 
2008, 2024 - * - * Authors: - * Christoph Schlameuss - * Carsten Otte - */ - -#ifndef SELFTEST_KVM_SIE_H -#define SELFTEST_KVM_SIE_H - -#include - -struct kvm_s390_sie_block { -#define CPUSTAT_STOPPED 0x80000000 -#define CPUSTAT_WAIT 0x10000000 -#define CPUSTAT_ECALL_PEND 0x08000000 -#define CPUSTAT_STOP_INT 0x04000000 -#define CPUSTAT_IO_INT 0x02000000 -#define CPUSTAT_EXT_INT 0x01000000 -#define CPUSTAT_RUNNING 0x00800000 -#define CPUSTAT_RETAINED 0x00400000 -#define CPUSTAT_TIMING_SUB 0x00020000 -#define CPUSTAT_SIE_SUB 0x00010000 -#define CPUSTAT_RRF 0x00008000 -#define CPUSTAT_SLSV 0x00004000 -#define CPUSTAT_SLSR 0x00002000 -#define CPUSTAT_ZARCH 0x00000800 -#define CPUSTAT_MCDS 0x00000100 -#define CPUSTAT_KSS 0x00000200 -#define CPUSTAT_SM 0x00000080 -#define CPUSTAT_IBS 0x00000040 -#define CPUSTAT_GED2 0x00000010 -#define CPUSTAT_G 0x00000008 -#define CPUSTAT_GED 0x00000004 -#define CPUSTAT_J 0x00000002 -#define CPUSTAT_P 0x00000001 - __u32 cpuflags; /* 0x0000 */ - __u32: 1; /* 0x0004 */ - __u32 prefix : 18; - __u32: 1; - __u32 ibc : 12; - __u8 reserved08[4]; /* 0x0008 */ -#define PROG_IN_SIE BIT(0) - __u32 prog0c; /* 0x000c */ - union { - __u8 reserved10[16]; /* 0x0010 */ - struct { - __u64 pv_handle_cpu; - __u64 pv_handle_config; - }; - }; -#define PROG_BLOCK_SIE BIT(0) -#define PROG_REQUEST BIT(1) - __u32 prog20; /* 0x0020 */ - __u8 reserved24[4]; /* 0x0024 */ - __u64 cputm; /* 0x0028 */ - __u64 ckc; /* 0x0030 */ - __u64 epoch; /* 0x0038 */ - __u32 svcc; /* 0x0040 */ -#define LCTL_CR0 0x8000 -#define LCTL_CR6 0x0200 -#define LCTL_CR9 0x0040 -#define LCTL_CR10 0x0020 -#define LCTL_CR11 0x0010 -#define LCTL_CR14 0x0002 - __u16 lctl; /* 0x0044 */ - __s16 icpua; /* 0x0046 */ -#define ICTL_OPEREXC 0x80000000 -#define ICTL_PINT 0x20000000 -#define ICTL_LPSW 0x00400000 -#define ICTL_STCTL 0x00040000 -#define ICTL_ISKE 0x00004000 -#define ICTL_SSKE 0x00002000 -#define ICTL_RRBE 0x00001000 -#define ICTL_TPROT 0x00000200 - __u32 ictl; /* 0x0048 */ -#define ECA_CEI 0x80000000 -#define ECA_IB 0x40000000 -#define ECA_SIGPI 0x10000000 -#define ECA_MVPGI 0x01000000 -#define ECA_AIV 0x00200000 -#define ECA_VX 0x00020000 -#define ECA_PROTEXCI 0x00002000 -#define ECA_APIE 0x00000008 -#define ECA_SII 0x00000001 - __u32 eca; /* 0x004c */ -#define ICPT_INST 0x04 -#define ICPT_PROGI 0x08 -#define ICPT_INSTPROGI 0x0C -#define ICPT_EXTREQ 0x10 -#define ICPT_EXTINT 0x14 -#define ICPT_IOREQ 0x18 -#define ICPT_WAIT 0x1c -#define ICPT_VALIDITY 0x20 -#define ICPT_STOP 0x28 -#define ICPT_OPEREXC 0x2C -#define ICPT_PARTEXEC 0x38 -#define ICPT_IOINST 0x40 -#define ICPT_KSS 0x5c -#define ICPT_MCHKREQ 0x60 -#define ICPT_INT_ENABLE 0x64 -#define ICPT_PV_INSTR 0x68 -#define ICPT_PV_NOTIFY 0x6c -#define ICPT_PV_PREF 0x70 - __u8 icptcode; /* 0x0050 */ - __u8 icptstatus; /* 0x0051 */ - __u16 ihcpu; /* 0x0052 */ - __u8 reserved54; /* 0x0054 */ -#define IICTL_CODE_NONE 0x00 -#define IICTL_CODE_MCHK 0x01 -#define IICTL_CODE_EXT 0x02 -#define IICTL_CODE_IO 0x03 -#define IICTL_CODE_RESTART 0x04 -#define IICTL_CODE_SPECIFICATION 0x10 -#define IICTL_CODE_OPERAND 0x11 - __u8 iictl; /* 0x0055 */ - __u16 ipa; /* 0x0056 */ - __u32 ipb; /* 0x0058 */ - __u32 scaoh; /* 0x005c */ -#define FPF_BPBC 0x20 - __u8 fpf; /* 0x0060 */ -#define ECB_GS 0x40 -#define ECB_TE 0x10 -#define ECB_SPECI 0x08 -#define ECB_SRSI 0x04 -#define ECB_HOSTPROTINT 0x02 -#define ECB_PTF 0x01 - __u8 ecb; /* 0x0061 */ -#define ECB2_CMMA 0x80 -#define ECB2_IEP 0x20 -#define ECB2_PFMFI 0x08 -#define ECB2_ESCA 0x04 -#define ECB2_ZPCI_LSI 0x02 - __u8 ecb2; /* 
0x0062 */ -#define ECB3_AISI 0x20 -#define ECB3_AISII 0x10 -#define ECB3_DEA 0x08 -#define ECB3_AES 0x04 -#define ECB3_RI 0x01 - __u8 ecb3; /* 0x0063 */ -#define ESCA_SCAOL_MASK ~0x3fU - __u32 scaol; /* 0x0064 */ - __u8 sdf; /* 0x0068 */ - __u8 epdx; /* 0x0069 */ - __u8 cpnc; /* 0x006a */ - __u8 reserved6b; /* 0x006b */ - __u32 todpr; /* 0x006c */ -#define GISA_FORMAT1 0x00000001 - __u32 gd; /* 0x0070 */ - __u8 reserved74[12]; /* 0x0074 */ - __u64 mso; /* 0x0080 */ - __u64 msl; /* 0x0088 */ - __u64 psw_mask; /* 0x0090 */ - __u64 psw_addr; /* 0x0098 */ - __u64 gg14; /* 0x00a0 */ - __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[8]; /* 0x00b0 */ -#define HPID_KVM 0x4 -#define HPID_VSIE 0x5 - __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[7]; /* 0x00b9 */ - union { - struct { - __u32 eiparams; /* 0x00c0 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ - }; - __u64 mcic; /* 0x00c0 */ - } __packed; - __u32 reservedc8; /* 0x00c8 */ - union { - struct { - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - }; - __u32 edc; /* 0x00cc */ - } __packed; - union { - struct { - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ - }; - __u64 faddr; /* 0x00d0 */ - } __packed; - __u64 peraddr; /* 0x00d8 */ - __u8 eai; /* 0x00e0 */ - __u8 peraid; /* 0x00e1 */ - __u8 oai; /* 0x00e2 */ - __u8 armid; /* 0x00e3 */ - __u8 reservede4[4]; /* 0x00e4 */ - union { - __u64 tecmc; /* 0x00e8 */ - struct { - __u16 subchannel_id; /* 0x00e8 */ - __u16 subchannel_nr; /* 0x00ea */ - __u32 io_int_parm; /* 0x00ec */ - __u32 io_int_word; /* 0x00f0 */ - }; - } __packed; - __u8 reservedf4[8]; /* 0x00f4 */ -#define CRYCB_FORMAT_MASK 0x00000003 -#define CRYCB_FORMAT0 0x00000000 -#define CRYCB_FORMAT1 0x00000001 -#define CRYCB_FORMAT2 0x00000003 - __u32 crycbd; /* 0x00fc */ - __u64 gcr[16]; /* 0x0100 */ - union { - __u64 gbea; /* 0x0180 */ - __u64 sidad; - }; - __u8 reserved188[8]; /* 0x0188 */ - __u64 sdnxo; /* 0x0190 */ - __u8 reserved198[8]; /* 0x0198 */ - __u32 fac; /* 0x01a0 */ - __u8 reserved1a4[20]; /* 0x01a4 */ - __u64 cbrlo; /* 0x01b8 */ - __u8 reserved1c0[8]; /* 0x01c0 */ -#define ECD_HOSTREGMGMT 0x20000000 -#define ECD_MEF 0x08000000 -#define ECD_ETOKENF 0x02000000 -#define ECD_ECC 0x00200000 - __u32 ecd; /* 0x01c8 */ - __u8 reserved1cc[18]; /* 0x01cc */ - __u64 pp; /* 0x01de */ - __u8 reserved1e6[2]; /* 0x01e6 */ - __u64 itdba; /* 0x01e8 */ - __u64 riccbd; /* 0x01f0 */ - __u64 gvrd; /* 0x01f8 */ -} __packed __aligned(512); - -#endif /* SELFTEST_KVM_SIE_H */ diff --git a/tools/testing/selftests/kvm/include/s390x/ucall.h b/tools/testing/selftests/kvm/include/s390x/ucall.h deleted file mode 100644 index 8035a872a351..000000000000 --- a/tools/testing/selftests/kvm/include/s390x/ucall.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UCALL_H -#define SELFTEST_KVM_UCALL_H - -#include "kvm_util.h" - -#define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC - -static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) -{ -} - -static inline void ucall_arch_do_ucall(vm_vaddr_t uc) -{ - /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ - asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory"); -} - -#endif diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h new file mode 100644 index 000000000000..80fe9f69b38d --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * 
Copyright (C) 2021, Google LLC. + */ + +#ifndef SELFTEST_KVM_APIC_H +#define SELFTEST_KVM_APIC_H + +#include + +#include "processor.h" +#include "ucall_common.h" + +#define APIC_DEFAULT_GPA 0xfee00000ULL + +/* APIC base address MSR and fields */ +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_EXTD (1<<10) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define GET_APIC_BASE(x) (((x) >> 12) << 12) + +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) +#define APIC_ID 0x20 +#define APIC_LVR 0x30 +#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) +#define APIC_TASKPRI 0x80 +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_IRR 0x200 +#define APIC_ICR 0x300 +#define APIC_LVTCMCI 0x2f0 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define APIC_LVTT 0x320 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) +#define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) +#define APIC_LVT_MASKED (1 << 16) +#define APIC_TMICT 0x380 +#define APIC_TMCCT 0x390 +#define APIC_TDCR 0x3E0 + +void apic_disable(void); +void xapic_enable(void); +void x2apic_enable(void); + +static inline uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static inline uint32_t xapic_read_reg(unsigned int reg) +{ + return ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2]; +} + +static inline void xapic_write_reg(unsigned int reg, uint32_t val) +{ + ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2] = val; +} + +static inline uint64_t x2apic_read_reg(unsigned int reg) +{ + return rdmsr(APIC_BASE_MSR + (reg >> 4)); +} + +static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) +{ + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); +} + +static inline void x2apic_write_reg(unsigned int reg, uint64_t value) +{ + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", + fault, APIC_BASE_MSR + (reg >> 4), value); +} + +static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) +{ + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(fault == GP_VECTOR, + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", + APIC_BASE_MSR + (reg >> 4), value, fault); +} + + +#endif /* SELFTEST_KVM_APIC_H */ diff --git a/tools/testing/selftests/kvm/include/x86/evmcs.h b/tools/testing/selftests/kvm/include/x86/evmcs.h new file mode 100644 index 000000000000..5a74bb30e2f8 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/evmcs.h @@ -0,0 +1,1276 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 
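x2apic_read_reg() and x2apic_write_reg() above rely on the fixed relationship between the two programming interfaces: each 16-byte xAPIC MMIO register at offset reg corresponds to one x2APIC MSR at APIC_BASE_MSR + (reg >> 4), while xapic_read_reg()/xapic_write_reg() index the MMIO page in 32-bit words. A stand-alone sketch of that mapping; the x2apic_msr() helper is invented for the example:

#include <stdio.h>

#define APIC_DEFAULT_GPA	0xfee00000ULL
#define APIC_BASE_MSR		0x800
#define APIC_ID			0x20
#define APIC_EOI		0xB0
#define APIC_ICR		0x300

static unsigned int x2apic_msr(unsigned int reg)
{
	return APIC_BASE_MSR + (reg >> 4);
}

int main(void)
{
	/* xAPIC MMIO address vs. x2APIC MSR number for a few registers. */
	printf("APIC_ID : MMIO 0x%llx, MSR 0x%x\n",
	       APIC_DEFAULT_GPA + APIC_ID, x2apic_msr(APIC_ID));	/* 0x802 */
	printf("APIC_EOI: MMIO 0x%llx, MSR 0x%x\n",
	       APIC_DEFAULT_GPA + APIC_EOI, x2apic_msr(APIC_EOI));	/* 0x80b */
	printf("APIC_ICR: MMIO 0x%llx, MSR 0x%x\n",
	       APIC_DEFAULT_GPA + APIC_ICR, x2apic_msr(APIC_ICR));	/* 0x830 */
	return 0;
}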
Copyright (C) 2018, Red Hat, Inc. + */ + +#ifndef SELFTEST_KVM_EVMCS_H +#define SELFTEST_KVM_EVMCS_H + +#include +#include "hyperv.h" +#include "vmx.h" + +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#define EVMCS_VERSION 1 + +extern bool enable_evmcs; + +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u16 padding16_1; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16_2[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 
vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 padding32_1; + u32 hv_synthetic_controls; + struct { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 reserved:30; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u32 padding32_2; + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 guest_ia32_perf_global_ctrl; + u64 guest_ia32_s_cet; + u64 guest_ssp; + u64 guest_ia32_int_ssp_table_addr; + u64 guest_ia32_lbr_ctl; + u64 padding64_5[2]; + u64 xss_exit_bitmap; + u64 encls_exiting_bitmap; + u64 host_ia32_perf_global_ctrl; + u64 tsc_multiplier; + u64 host_ia32_s_cet; + u64 host_ssp; + u64 host_ia32_int_ssp_table_addr; + u64 padding64_6; +} __packed; + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + +extern struct hv_enlightened_vmcs *current_evmcs; + +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); + +static inline void evmcs_enable(void) +{ + enable_evmcs = true; +} + +static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) +{ + current_vp_assist->current_nested_vmcs = vmcs_pa; + current_vp_assist->enlighten_vmentry = 1; + + current_evmcs = vmcs; + + return 0; +} + +static inline bool load_evmcs(struct hyperv_test_pages *hv) +{ + if (evmcs_vmptrld(hv->enlightened_vmcs_gpa, hv->enlightened_vmcs)) + return false; + + current_evmcs->revision_id = EVMCS_VERSION; + + return true; +} + +static inline int evmcs_vmptrst(uint64_t *value) +{ + *value = current_vp_assist->current_nested_vmcs & + ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + return 0; +} + +static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) +{ + switch (encoding) { + case GUEST_RIP: + *value = current_evmcs->guest_rip; + break; + case GUEST_RSP: + *value = current_evmcs->guest_rsp; + break; + case GUEST_RFLAGS: + *value = current_evmcs->guest_rflags; + break; + case HOST_IA32_PAT: + *value = current_evmcs->host_ia32_pat; + break; + case HOST_IA32_EFER: + *value = current_evmcs->host_ia32_efer; + break; + case HOST_CR0: + *value = current_evmcs->host_cr0; + break; + case HOST_CR3: + *value = current_evmcs->host_cr3; + break; + case HOST_CR4: + *value = current_evmcs->host_cr4; + break; + case HOST_IA32_SYSENTER_ESP: + *value = current_evmcs->host_ia32_sysenter_esp; + break; + case HOST_IA32_SYSENTER_EIP: + *value = current_evmcs->host_ia32_sysenter_eip; + break; + case HOST_RIP: + *value = current_evmcs->host_rip; + break; + case 
IO_BITMAP_A: + *value = current_evmcs->io_bitmap_a; + break; + case IO_BITMAP_B: + *value = current_evmcs->io_bitmap_b; + break; + case MSR_BITMAP: + *value = current_evmcs->msr_bitmap; + break; + case GUEST_ES_BASE: + *value = current_evmcs->guest_es_base; + break; + case GUEST_CS_BASE: + *value = current_evmcs->guest_cs_base; + break; + case GUEST_SS_BASE: + *value = current_evmcs->guest_ss_base; + break; + case GUEST_DS_BASE: + *value = current_evmcs->guest_ds_base; + break; + case GUEST_FS_BASE: + *value = current_evmcs->guest_fs_base; + break; + case GUEST_GS_BASE: + *value = current_evmcs->guest_gs_base; + break; + case GUEST_LDTR_BASE: + *value = current_evmcs->guest_ldtr_base; + break; + case GUEST_TR_BASE: + *value = current_evmcs->guest_tr_base; + break; + case GUEST_GDTR_BASE: + *value = current_evmcs->guest_gdtr_base; + break; + case GUEST_IDTR_BASE: + *value = current_evmcs->guest_idtr_base; + break; + case TSC_OFFSET: + *value = current_evmcs->tsc_offset; + break; + case VIRTUAL_APIC_PAGE_ADDR: + *value = current_evmcs->virtual_apic_page_addr; + break; + case VMCS_LINK_POINTER: + *value = current_evmcs->vmcs_link_pointer; + break; + case GUEST_IA32_DEBUGCTL: + *value = current_evmcs->guest_ia32_debugctl; + break; + case GUEST_IA32_PAT: + *value = current_evmcs->guest_ia32_pat; + break; + case GUEST_IA32_EFER: + *value = current_evmcs->guest_ia32_efer; + break; + case GUEST_PDPTR0: + *value = current_evmcs->guest_pdptr0; + break; + case GUEST_PDPTR1: + *value = current_evmcs->guest_pdptr1; + break; + case GUEST_PDPTR2: + *value = current_evmcs->guest_pdptr2; + break; + case GUEST_PDPTR3: + *value = current_evmcs->guest_pdptr3; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + *value = current_evmcs->guest_pending_dbg_exceptions; + break; + case GUEST_SYSENTER_ESP: + *value = current_evmcs->guest_sysenter_esp; + break; + case GUEST_SYSENTER_EIP: + *value = current_evmcs->guest_sysenter_eip; + break; + case CR0_GUEST_HOST_MASK: + *value = current_evmcs->cr0_guest_host_mask; + break; + case CR4_GUEST_HOST_MASK: + *value = current_evmcs->cr4_guest_host_mask; + break; + case CR0_READ_SHADOW: + *value = current_evmcs->cr0_read_shadow; + break; + case CR4_READ_SHADOW: + *value = current_evmcs->cr4_read_shadow; + break; + case GUEST_CR0: + *value = current_evmcs->guest_cr0; + break; + case GUEST_CR3: + *value = current_evmcs->guest_cr3; + break; + case GUEST_CR4: + *value = current_evmcs->guest_cr4; + break; + case GUEST_DR7: + *value = current_evmcs->guest_dr7; + break; + case HOST_FS_BASE: + *value = current_evmcs->host_fs_base; + break; + case HOST_GS_BASE: + *value = current_evmcs->host_gs_base; + break; + case HOST_TR_BASE: + *value = current_evmcs->host_tr_base; + break; + case HOST_GDTR_BASE: + *value = current_evmcs->host_gdtr_base; + break; + case HOST_IDTR_BASE: + *value = current_evmcs->host_idtr_base; + break; + case HOST_RSP: + *value = current_evmcs->host_rsp; + break; + case EPT_POINTER: + *value = current_evmcs->ept_pointer; + break; + case GUEST_BNDCFGS: + *value = current_evmcs->guest_bndcfgs; + break; + case XSS_EXIT_BITMAP: + *value = current_evmcs->xss_exit_bitmap; + break; + case GUEST_PHYSICAL_ADDRESS: + *value = current_evmcs->guest_physical_address; + break; + case EXIT_QUALIFICATION: + *value = current_evmcs->exit_qualification; + break; + case GUEST_LINEAR_ADDRESS: + *value = current_evmcs->guest_linear_address; + break; + case VM_EXIT_MSR_STORE_ADDR: + *value = current_evmcs->vm_exit_msr_store_addr; + break; + case VM_EXIT_MSR_LOAD_ADDR: + *value = 
current_evmcs->vm_exit_msr_load_addr; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + *value = current_evmcs->vm_entry_msr_load_addr; + break; + case CR3_TARGET_VALUE0: + *value = current_evmcs->cr3_target_value0; + break; + case CR3_TARGET_VALUE1: + *value = current_evmcs->cr3_target_value1; + break; + case CR3_TARGET_VALUE2: + *value = current_evmcs->cr3_target_value2; + break; + case CR3_TARGET_VALUE3: + *value = current_evmcs->cr3_target_value3; + break; + case TPR_THRESHOLD: + *value = current_evmcs->tpr_threshold; + break; + case GUEST_INTERRUPTIBILITY_INFO: + *value = current_evmcs->guest_interruptibility_info; + break; + case CPU_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->cpu_based_vm_exec_control; + break; + case EXCEPTION_BITMAP: + *value = current_evmcs->exception_bitmap; + break; + case VM_ENTRY_CONTROLS: + *value = current_evmcs->vm_entry_controls; + break; + case VM_ENTRY_INTR_INFO_FIELD: + *value = current_evmcs->vm_entry_intr_info_field; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + *value = current_evmcs->vm_entry_exception_error_code; + break; + case VM_ENTRY_INSTRUCTION_LEN: + *value = current_evmcs->vm_entry_instruction_len; + break; + case HOST_IA32_SYSENTER_CS: + *value = current_evmcs->host_ia32_sysenter_cs; + break; + case PIN_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->pin_based_vm_exec_control; + break; + case VM_EXIT_CONTROLS: + *value = current_evmcs->vm_exit_controls; + break; + case SECONDARY_VM_EXEC_CONTROL: + *value = current_evmcs->secondary_vm_exec_control; + break; + case GUEST_ES_LIMIT: + *value = current_evmcs->guest_es_limit; + break; + case GUEST_CS_LIMIT: + *value = current_evmcs->guest_cs_limit; + break; + case GUEST_SS_LIMIT: + *value = current_evmcs->guest_ss_limit; + break; + case GUEST_DS_LIMIT: + *value = current_evmcs->guest_ds_limit; + break; + case GUEST_FS_LIMIT: + *value = current_evmcs->guest_fs_limit; + break; + case GUEST_GS_LIMIT: + *value = current_evmcs->guest_gs_limit; + break; + case GUEST_LDTR_LIMIT: + *value = current_evmcs->guest_ldtr_limit; + break; + case GUEST_TR_LIMIT: + *value = current_evmcs->guest_tr_limit; + break; + case GUEST_GDTR_LIMIT: + *value = current_evmcs->guest_gdtr_limit; + break; + case GUEST_IDTR_LIMIT: + *value = current_evmcs->guest_idtr_limit; + break; + case GUEST_ES_AR_BYTES: + *value = current_evmcs->guest_es_ar_bytes; + break; + case GUEST_CS_AR_BYTES: + *value = current_evmcs->guest_cs_ar_bytes; + break; + case GUEST_SS_AR_BYTES: + *value = current_evmcs->guest_ss_ar_bytes; + break; + case GUEST_DS_AR_BYTES: + *value = current_evmcs->guest_ds_ar_bytes; + break; + case GUEST_FS_AR_BYTES: + *value = current_evmcs->guest_fs_ar_bytes; + break; + case GUEST_GS_AR_BYTES: + *value = current_evmcs->guest_gs_ar_bytes; + break; + case GUEST_LDTR_AR_BYTES: + *value = current_evmcs->guest_ldtr_ar_bytes; + break; + case GUEST_TR_AR_BYTES: + *value = current_evmcs->guest_tr_ar_bytes; + break; + case GUEST_ACTIVITY_STATE: + *value = current_evmcs->guest_activity_state; + break; + case GUEST_SYSENTER_CS: + *value = current_evmcs->guest_sysenter_cs; + break; + case VM_INSTRUCTION_ERROR: + *value = current_evmcs->vm_instruction_error; + break; + case VM_EXIT_REASON: + *value = current_evmcs->vm_exit_reason; + break; + case VM_EXIT_INTR_INFO: + *value = current_evmcs->vm_exit_intr_info; + break; + case VM_EXIT_INTR_ERROR_CODE: + *value = current_evmcs->vm_exit_intr_error_code; + break; + case IDT_VECTORING_INFO_FIELD: + *value = current_evmcs->idt_vectoring_info_field; + break; + case IDT_VECTORING_ERROR_CODE: 
+ *value = current_evmcs->idt_vectoring_error_code; + break; + case VM_EXIT_INSTRUCTION_LEN: + *value = current_evmcs->vm_exit_instruction_len; + break; + case VMX_INSTRUCTION_INFO: + *value = current_evmcs->vmx_instruction_info; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + *value = current_evmcs->page_fault_error_code_mask; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + *value = current_evmcs->page_fault_error_code_match; + break; + case CR3_TARGET_COUNT: + *value = current_evmcs->cr3_target_count; + break; + case VM_EXIT_MSR_STORE_COUNT: + *value = current_evmcs->vm_exit_msr_store_count; + break; + case VM_EXIT_MSR_LOAD_COUNT: + *value = current_evmcs->vm_exit_msr_load_count; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + *value = current_evmcs->vm_entry_msr_load_count; + break; + case HOST_ES_SELECTOR: + *value = current_evmcs->host_es_selector; + break; + case HOST_CS_SELECTOR: + *value = current_evmcs->host_cs_selector; + break; + case HOST_SS_SELECTOR: + *value = current_evmcs->host_ss_selector; + break; + case HOST_DS_SELECTOR: + *value = current_evmcs->host_ds_selector; + break; + case HOST_FS_SELECTOR: + *value = current_evmcs->host_fs_selector; + break; + case HOST_GS_SELECTOR: + *value = current_evmcs->host_gs_selector; + break; + case HOST_TR_SELECTOR: + *value = current_evmcs->host_tr_selector; + break; + case GUEST_ES_SELECTOR: + *value = current_evmcs->guest_es_selector; + break; + case GUEST_CS_SELECTOR: + *value = current_evmcs->guest_cs_selector; + break; + case GUEST_SS_SELECTOR: + *value = current_evmcs->guest_ss_selector; + break; + case GUEST_DS_SELECTOR: + *value = current_evmcs->guest_ds_selector; + break; + case GUEST_FS_SELECTOR: + *value = current_evmcs->guest_fs_selector; + break; + case GUEST_GS_SELECTOR: + *value = current_evmcs->guest_gs_selector; + break; + case GUEST_LDTR_SELECTOR: + *value = current_evmcs->guest_ldtr_selector; + break; + case GUEST_TR_SELECTOR: + *value = current_evmcs->guest_tr_selector; + break; + case VIRTUAL_PROCESSOR_ID: + *value = current_evmcs->virtual_processor_id; + break; + case HOST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->host_ia32_perf_global_ctrl; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->guest_ia32_perf_global_ctrl; + break; + case ENCLS_EXITING_BITMAP: + *value = current_evmcs->encls_exiting_bitmap; + break; + case TSC_MULTIPLIER: + *value = current_evmcs->tsc_multiplier; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) +{ + switch (encoding) { + case GUEST_RIP: + current_evmcs->guest_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_RSP: + current_evmcs->guest_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case GUEST_RFLAGS: + current_evmcs->guest_rflags = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case HOST_IA32_PAT: + current_evmcs->host_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_EFER: + current_evmcs->host_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR0: + current_evmcs->host_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR3: + current_evmcs->host_cr3 = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR4: + current_evmcs->host_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_SYSENTER_ESP: + current_evmcs->host_ia32_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_SYSENTER_EIP: + current_evmcs->host_ia32_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_RIP: + current_evmcs->host_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case IO_BITMAP_A: + current_evmcs->io_bitmap_a = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; + break; + case IO_BITMAP_B: + current_evmcs->io_bitmap_b = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; + break; + case MSR_BITMAP: + current_evmcs->msr_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + break; + case GUEST_ES_BASE: + current_evmcs->guest_es_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_BASE: + current_evmcs->guest_cs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_BASE: + current_evmcs->guest_ss_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_BASE: + current_evmcs->guest_ds_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_BASE: + current_evmcs->guest_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_BASE: + current_evmcs->guest_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_BASE: + current_evmcs->guest_ldtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_BASE: + current_evmcs->guest_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GDTR_BASE: + current_evmcs->guest_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_IDTR_BASE: + current_evmcs->guest_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case TSC_OFFSET: + current_evmcs->tsc_offset = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case VIRTUAL_APIC_PAGE_ADDR: + current_evmcs->virtual_apic_page_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case VMCS_LINK_POINTER: + current_evmcs->vmcs_link_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_DEBUGCTL: + current_evmcs->guest_ia32_debugctl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_PAT: + current_evmcs->guest_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_EFER: + current_evmcs->guest_ia32_efer = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR0: + current_evmcs->guest_pdptr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR1: + current_evmcs->guest_pdptr1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR2: + current_evmcs->guest_pdptr2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR3: + current_evmcs->guest_pdptr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + current_evmcs->guest_pending_dbg_exceptions = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_ESP: + current_evmcs->guest_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_EIP: + current_evmcs->guest_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case CR0_GUEST_HOST_MASK: + current_evmcs->cr0_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR4_GUEST_HOST_MASK: + current_evmcs->cr4_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR0_READ_SHADOW: + current_evmcs->cr0_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR4_READ_SHADOW: + current_evmcs->cr4_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR0: + current_evmcs->guest_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR3: + current_evmcs->guest_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR4: + current_evmcs->guest_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_DR7: + current_evmcs->guest_dr7 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case HOST_FS_BASE: + current_evmcs->host_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_GS_BASE: + current_evmcs->host_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_TR_BASE: + current_evmcs->host_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_GDTR_BASE: + current_evmcs->host_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_IDTR_BASE: + current_evmcs->host_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_RSP: + current_evmcs->host_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case EPT_POINTER: + current_evmcs->ept_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; + break; + case GUEST_BNDCFGS: + current_evmcs->guest_bndcfgs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case XSS_EXIT_BITMAP: + 
current_evmcs->xss_exit_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case GUEST_PHYSICAL_ADDRESS: + current_evmcs->guest_physical_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case EXIT_QUALIFICATION: + current_evmcs->exit_qualification = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_LINEAR_ADDRESS: + current_evmcs->guest_linear_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_MSR_STORE_ADDR: + current_evmcs->vm_exit_msr_store_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_LOAD_ADDR: + current_evmcs->vm_exit_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + current_evmcs->vm_entry_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE0: + current_evmcs->cr3_target_value0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE1: + current_evmcs->cr3_target_value1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE2: + current_evmcs->cr3_target_value2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE3: + current_evmcs->cr3_target_value3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case TPR_THRESHOLD: + current_evmcs->tpr_threshold = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_INTERRUPTIBILITY_INFO: + current_evmcs->guest_interruptibility_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case CPU_BASED_VM_EXEC_CONTROL: + current_evmcs->cpu_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC; + break; + case EXCEPTION_BITMAP: + current_evmcs->exception_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN; + break; + case VM_ENTRY_CONTROLS: + current_evmcs->vm_entry_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY; + break; + case VM_ENTRY_INTR_INFO_FIELD: + current_evmcs->vm_entry_intr_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + current_evmcs->vm_entry_exception_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case VM_ENTRY_INSTRUCTION_LEN: + current_evmcs->vm_entry_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case HOST_IA32_SYSENTER_CS: + current_evmcs->host_ia32_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case PIN_BASED_VM_EXEC_CONTROL: + current_evmcs->pin_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; + break; + case VM_EXIT_CONTROLS: + current_evmcs->vm_exit_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; 
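Each evmcs_vmwrite() case above pairs the field update with clearing the clean-field bit of the group that field belongs to, so the hypervisor knows it must reload that group on the next enlightened VM-entry. A minimal sketch of that bookkeeping; the two-field struct is a stand-in for illustration, not the real hv_enlightened_vmcs layout:

#include <stdint.h>
#include <stdio.h>

#define CLEAN_FIELD_HOST_GRP1	(1u << 14)	/* mirrors HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 */
#define CLEAN_FIELD_ALL		0xFFFFu

struct toy_evmcs {
	uint64_t host_cr3;
	uint32_t hv_clean_fields;
};

static void toy_vmwrite_host_cr3(struct toy_evmcs *evmcs, uint64_t value)
{
	evmcs->host_cr3 = value;
	evmcs->hv_clean_fields &= ~CLEAN_FIELD_HOST_GRP1;	/* group is dirty now */
}

int main(void)
{
	struct toy_evmcs evmcs = { .hv_clean_fields = CLEAN_FIELD_ALL };

	toy_vmwrite_host_cr3(&evmcs, 0x1000);
	printf("clean fields after write: 0x%x\n",
	       (unsigned)evmcs.hv_clean_fields);	/* 0xbfff: HOST_GRP1 cleared */
	return 0;
}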
+ break; + case SECONDARY_VM_EXEC_CONTROL: + current_evmcs->secondary_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; + break; + case GUEST_ES_LIMIT: + current_evmcs->guest_es_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_LIMIT: + current_evmcs->guest_cs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_LIMIT: + current_evmcs->guest_ss_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_LIMIT: + current_evmcs->guest_ds_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_LIMIT: + current_evmcs->guest_fs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_LIMIT: + current_evmcs->guest_gs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_LIMIT: + current_evmcs->guest_ldtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_LIMIT: + current_evmcs->guest_tr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GDTR_LIMIT: + current_evmcs->guest_gdtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_IDTR_LIMIT: + current_evmcs->guest_idtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_ES_AR_BYTES: + current_evmcs->guest_es_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_AR_BYTES: + current_evmcs->guest_cs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_AR_BYTES: + current_evmcs->guest_ss_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_AR_BYTES: + current_evmcs->guest_ds_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_AR_BYTES: + current_evmcs->guest_fs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_AR_BYTES: + current_evmcs->guest_gs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_AR_BYTES: + current_evmcs->guest_ldtr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_AR_BYTES: + current_evmcs->guest_tr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_ACTIVITY_STATE: + current_evmcs->guest_activity_state = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_CS: + current_evmcs->guest_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case VM_INSTRUCTION_ERROR: + current_evmcs->vm_instruction_error = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_REASON: + current_evmcs->vm_exit_reason = value; + 
current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INTR_INFO: + current_evmcs->vm_exit_intr_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INTR_ERROR_CODE: + current_evmcs->vm_exit_intr_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case IDT_VECTORING_INFO_FIELD: + current_evmcs->idt_vectoring_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case IDT_VECTORING_ERROR_CODE: + current_evmcs->idt_vectoring_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INSTRUCTION_LEN: + current_evmcs->vm_exit_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VMX_INSTRUCTION_INFO: + current_evmcs->vmx_instruction_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + current_evmcs->page_fault_error_code_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + current_evmcs->page_fault_error_code_match = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_COUNT: + current_evmcs->cr3_target_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_STORE_COUNT: + current_evmcs->vm_exit_msr_store_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_LOAD_COUNT: + current_evmcs->vm_exit_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + current_evmcs->vm_entry_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case HOST_ES_SELECTOR: + current_evmcs->host_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CS_SELECTOR: + current_evmcs->host_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_SS_SELECTOR: + current_evmcs->host_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_DS_SELECTOR: + current_evmcs->host_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_FS_SELECTOR: + current_evmcs->host_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_GS_SELECTOR: + current_evmcs->host_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_TR_SELECTOR: + current_evmcs->host_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_ES_SELECTOR: + current_evmcs->guest_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_SELECTOR: + current_evmcs->guest_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_SELECTOR: + current_evmcs->guest_ss_selector = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_SELECTOR: + current_evmcs->guest_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_SELECTOR: + current_evmcs->guest_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_SELECTOR: + current_evmcs->guest_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_SELECTOR: + current_evmcs->guest_ldtr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_SELECTOR: + current_evmcs->guest_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case VIRTUAL_PROCESSOR_ID: + current_evmcs->virtual_processor_id = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; + break; + case HOST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->host_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->guest_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case ENCLS_EXITING_BITMAP: + current_evmcs->encls_exiting_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case TSC_MULTIPLIER: + current_evmcs->tsc_multiplier = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmlaunch(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)¤t_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)¤t_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. GPRs) is established by this vmresume. 
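Taken together, these helpers implement the enlightened VMCS dirty-tracking protocol: every field write above clears the matching hv_clean_fields bit, evmcs_vmlaunch() zeroes hv_clean_fields so the first entry loads everything, and evmcs_vmresume() (below) only re-dirties the host RIP/RSP groups. A rough, illustrative guest-side flow, assuming the field writer whose switch appears above is evmcs_vmwrite() and that an enlightened VMCS has already been loaded and prepared:

/*
 * Sketch only: current_evmcs must already point at a loaded eVMCS and the
 * L2 guest state must have been prepared elsewhere (e.g. via the vmx lib).
 */
static void l1_enter_l2_twice(void)
{
	/* Dirty a control field; its clean bit is cleared by the write. */
	evmcs_vmwrite(TPR_THRESHOLD, 0);

	/* First entry: evmcs_vmlaunch() marks every field group dirty. */
	GUEST_ASSERT(!evmcs_vmlaunch());

	/* After the first exit, only re-write what actually changed. */
	evmcs_vmwrite(TPR_THRESHOLD, 4);
	GUEST_ASSERT(!evmcs_vmresume());
}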
+ */ +static inline int evmcs_vmresume(void) +{ + int ret; + + /* HOST_RIP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + /* HOST_RSP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)¤t_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)¤t_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/x86/hyperv.h b/tools/testing/selftests/kvm/include/x86/hyperv.h new file mode 100644 index 000000000000..f13e532be240 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/hyperv.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Red Hat, Inc. + */ + +#ifndef SELFTEST_KVM_HYPERV_H +#define SELFTEST_KVM_HYPERV_H + +#include "processor.h" + +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_HYPERCALL 0x40000001 +#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_X64_MSR_RESET 0x40000003 +#define HV_X64_MSR_VP_RUNTIME 0x40000010 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +#define 
HV_X64_MSR_GUEST_IDLE 0x400000F0 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* HYPERV_CPUID_FEATURES.EAX */ +#define HV_MSR_VP_RUNTIME_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 0) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 1) +#define HV_MSR_SYNIC_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 2) +#define HV_MSR_SYNTIMER_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 3) +#define HV_MSR_APIC_ACCESS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 4) +#define HV_MSR_HYPERCALL_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 5) +#define HV_MSR_VP_INDEX_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 6) +#define HV_MSR_RESET_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 7) +#define HV_MSR_STAT_PAGES_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 8) +#define HV_MSR_REFERENCE_TSC_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 9) +#define HV_MSR_GUEST_IDLE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 10) +#define HV_ACCESS_FREQUENCY_MSRS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 11) +#define HV_ACCESS_REENLIGHTENMENT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 13) +#define HV_ACCESS_TSC_INVARIANT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 15) + +/* HYPERV_CPUID_FEATURES.EBX */ +#define HV_CREATE_PARTITIONS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 0) +#define HV_ACCESS_PARTITION_ID \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 1) +#define HV_ACCESS_MEMORY_POOL \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 2) +#define HV_ADJUST_MESSAGE_BUFFERS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 3) +#define HV_POST_MESSAGES \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 4) +#define HV_SIGNAL_EVENTS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 5) +#define HV_CREATE_PORT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 6) +#define HV_CONNECT_PORT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 7) +#define HV_ACCESS_STATS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 8) +#define HV_DEBUGGING \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 11) +#define HV_CPU_MANAGEMENT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 12) +#define HV_ENABLE_EXTENDED_HYPERCALLS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 20) +#define HV_ISOLATION \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 22) + +/* HYPERV_CPUID_FEATURES.EDX */ +#define HV_X64_MWAIT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 0) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 1) +#define HV_X64_PERF_MONITOR_AVAILABLE \ 
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 2) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 3) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 4) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 5) +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 8) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 10) +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 11) +#define HV_STIMER_DIRECT_MODE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 19) + +/* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ +#define HV_X64_AS_SWITCH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 0) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 1) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 2) +#define HV_X64_APIC_ACCESS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 3) +#define HV_X64_SYSTEM_RESET_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 4) +#define HV_X64_RELAXED_TIMING_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 5) +#define HV_DEPRECATING_AEOI_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 9) +#define HV_X64_CLUSTER_IPI_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 10) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 11) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) + +/* HYPERV_CPUID_NESTED_FEATURES.EAX */ +#define HV_X64_NESTED_DIRECT_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) +#define HV_X64_NESTED_MSR_BITMAP \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) + +/* HYPERV_CPUID_NESTED_FEATURES.EBX */ +#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) + +/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) + +/* Hypercalls */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_CREATE_VP 0x004e +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 
0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +/* Extended hypercalls */ +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_ACCESS_DENIED 6 +#define HV_STATUS_OPERATION_DENIED 8 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* hypercall options */ +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 + +/* + * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status' + * is set to the hypercall status (if no exception occurred). + */ +static inline uint8_t __hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address, + uint64_t *hv_status) +{ + uint64_t error_code; + uint8_t vector; + + /* Note both the hypercall and the "asm safe" clobber r9-r11. */ + asm volatile("mov %[output_address], %%r8\n\t" + KVM_ASM_SAFE("vmcall") + : "=a" (*hv_status), + "+c" (control), "+d" (input_address), + KVM_ASM_SAFE_OUTPUTS(vector, error_code) + : [output_address] "r"(output_address), + "a" (-EFAULT) + : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); + return vector; +} + +/* Issue a Hyper-V hypercall and assert that it succeeded. */ +static inline void hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address) +{ + uint64_t hv_status; + uint8_t vector; + + vector = __hyperv_hypercall(control, input_address, output_address, &hv_status); + + GUEST_ASSERT(!vector); + GUEST_ASSERT((hv_status & 0xffff) == 0); +} + +/* Write 'Fast' hypercall input 'data' to the first 'n_sse_regs' SSE regs */ +static inline void hyperv_write_xmm_input(void *data, int n_sse_regs) +{ + int i; + + for (i = 0; i < n_sse_regs; i++) + write_sse_reg(i, (sse128_t *)(data + sizeof(sse128_t) * i)); +} + +/* Proper HV_X64_MSR_GUEST_OS_ID value */ +#define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) + +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + +/* Define virtual processor assist page structure. 
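A minimal guest-side sketch of how the hypercall helpers above are meant to be used. The function name and the input_gpa/output_gpa arguments are hypothetical (supplied by the host side of a test); the guest OS ID MSR must be non-zero before hypercalls are accepted, and an unknown hypercall code should complete with an error status rather than fault:

static void guest_hypercall_demo(vm_vaddr_t input_gpa, vm_vaddr_t output_gpa)
{
	uint64_t hv_status;
	uint8_t vector;

	/* Identify the guest OS first; hypercalls are refused otherwise. */
	wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);

	/* A bogus hypercall code should return a status, not a fault. */
	vector = __hyperv_hypercall(0xdeadbeef, input_gpa, output_gpa, &hv_status);
	GUEST_ASSERT(!vector);
	GUEST_ASSERT((hv_status & 0xffff) == HV_STATUS_INVALID_HYPERCALL_CODE);

	GUEST_DONE();
}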
*/ +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; + __u64 current_nested_vmcs; +} __packed; + +extern struct hv_vp_assist_page *current_vp_assist; + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist); + +struct hyperv_test_pages { + /* VP assist page */ + void *vp_assist_hva; + uint64_t vp_assist_gpa; + void *vp_assist; + + /* Partition assist page */ + void *partition_assist_hva; + uint64_t partition_assist_gpa; + void *partition_assist; + + /* Enlightened VMCS */ + void *enlightened_vmcs_hva; + uint64_t enlightened_vmcs_gpa; + void *enlightened_vmcs; +}; + +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva); + +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) + +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); + +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); + +#endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h new file mode 100644 index 000000000000..972bb1c4ab4c --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +#include +#include + +#include "kvm_util_types.h" +#include "test_util.h" + +extern bool is_forced_emulation_enabled; + +struct kvm_vm_arch { + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + + uint64_t c_bit; + uint64_t s_bit; + int sev_fd; + bool is_pt_protected; +}; + +static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) +{ + return arch->c_bit || arch->s_bit; +} + +#define vm_arch_has_protected_memory(vm) \ + __vm_arch_has_protected_memory(&(vm)->arch) + +#define vcpu_arch_put_guest(mem, __val) \ +do { \ + const typeof(mem) val = (__val); \ + \ + if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ + (mem) = val; \ + } else if (guest_random_bool(&guest_rng)) { \ + __asm__ __volatile__(KVM_FEP "mov %1, %0" \ + : "+m" (mem) \ + : "r" (val) : "memory"); \ + } else { \ + uint64_t __old = READ_ONCE(mem); \ + \ + __asm__ __volatile__(KVM_FEP LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ + : [ptr] "+m" (mem), [old] "+a" (__old) \ + : [new]"r" (val) : "memory", "cc"); \ + } \ +} while (0) + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86/mce.h b/tools/testing/selftests/kvm/include/x86/mce.h new file mode 100644 index 000000000000..295f2d554754 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/mce.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022, Google LLC. 
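For context, a hedged sketch of the usual host/guest split when consuming these allocations, loosely following the existing Hyper-V tests. Variable names, guest_code() and the vm/vcpu handles are the usual test-local conventions, not definitions from this header:

/* Host side (illustrative): back the pages and hand the GVA to the guest. */
	vm_vaddr_t hv_pages_gva = 0;
	struct hyperv_test_pages *hv_pages;

	hv_pages = vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
	vcpu_args_set(vcpu, 1, hv_pages_gva);

/* Guest side (illustrative): enable the VP assist page before relying on
 * any enlightenment that needs it (eVMCS, direct TLB flush, ...). */
static void guest_code(struct hyperv_test_pages *hv)
{
	enable_vp_assist(hv->vp_assist_gpa, hv->vp_assist);
}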
+ */ + +#ifndef SELFTEST_KVM_MCE_H +#define SELFTEST_KVM_MCE_H + +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define KVM_MAX_MCE_BANKS 32 +#define MCG_CAP_BANKS_MASK 0xff /* Bit 0-7 of the MCG_CAP register are #banks */ +#define MCI_STATUS_VAL (1ULL << 63) /* valid error */ +#define MCI_STATUS_UC (1ULL << 61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL << 60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL << 59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL << 58) /* addr reg. valid */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCI_CTL2_CMCI_EN BIT_ULL(30) + +#endif /* SELFTEST_KVM_MCE_H */ diff --git a/tools/testing/selftests/kvm/include/x86/pmu.h b/tools/testing/selftests/kvm/include/x86/pmu.h new file mode 100644 index 000000000000..3c10c4dc0ae8 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/pmu.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023, Tencent, Inc. + */ +#ifndef SELFTEST_KVM_PMU_H +#define SELFTEST_KVM_PMU_H + +#include + +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 + +/* + * Encode an eventsel+umask pair into event-select MSR format. Note, this is + * technically AMD's format, as Intel's format only supports 8 bits for the + * event selector, i.e. doesn't use bits 24:16 for the selector. But, OR-ing + * in '0' is a nop and won't clobber the CMASK. + */ +#define RAW_EVENT(eventsel, umask) (((eventsel & 0xf00UL) << 24) | \ + ((eventsel) & 0xff) | \ + ((umask) & 0xff) << 8) + +/* + * These are technically Intel's definitions, but except for CMASK (see above), + * AMD's layout is compatible with Intel's. + */ +#define ARCH_PERFMON_EVENTSEL_EVENT GENMASK_ULL(7, 0) +#define ARCH_PERFMON_EVENTSEL_UMASK GENMASK_ULL(15, 8) +#define ARCH_PERFMON_EVENTSEL_USR BIT_ULL(16) +#define ARCH_PERFMON_EVENTSEL_OS BIT_ULL(17) +#define ARCH_PERFMON_EVENTSEL_EDGE BIT_ULL(18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL BIT_ULL(19) +#define ARCH_PERFMON_EVENTSEL_INT BIT_ULL(20) +#define ARCH_PERFMON_EVENTSEL_ANY BIT_ULL(21) +#define ARCH_PERFMON_EVENTSEL_ENABLE BIT_ULL(22) +#define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23) +#define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24) + +/* RDPMC control flags, Intel only. */ +#define INTEL_RDPMC_METRICS BIT_ULL(29) +#define INTEL_RDPMC_FIXED BIT_ULL(30) +#define INTEL_RDPMC_FAST BIT_ULL(31) + +/* Fixed PMC controls, Intel only. 
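As a hedged illustration of the event encoding above, a guest-side helper that counts retired instructions on general-purpose counter 0. The MSR indices (MSR_P6_EVNTSEL0, MSR_IA32_PMC0, MSR_CORE_PERF_GLOBAL_CTRL) come from the tools copy of <asm/msr-index.h> and the rdmsr()/wrmsr() wrappers from processor.h, not from this header; an architectural PMU with global control is assumed, and the helper name is purely illustrative:

static uint64_t count_instructions_retired(void (*workload)(void))
{
	/* Event 0xc0 / umask 0x00 is "instructions retired" on Intel and AMD. */
	uint64_t eventsel = RAW_EVENT(0xc0, 0x00) |
			    ARCH_PERFMON_EVENTSEL_USR |
			    ARCH_PERFMON_EVENTSEL_OS |
			    ARCH_PERFMON_EVENTSEL_ENABLE;

	wrmsr(MSR_IA32_PMC0, 0);
	wrmsr(MSR_P6_EVNTSEL0, eventsel);
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(0));	/* enable GP counter 0 */

	workload();

	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
	return rdmsr(MSR_IA32_PMC0);
}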
*/ +#define FIXED_PMC_GLOBAL_CTRL_ENABLE(_idx) BIT_ULL((32 + (_idx))) + +#define FIXED_PMC_KERNEL BIT_ULL(0) +#define FIXED_PMC_USER BIT_ULL(1) +#define FIXED_PMC_ANYTHREAD BIT_ULL(2) +#define FIXED_PMC_ENABLE_PMI BIT_ULL(3) +#define FIXED_PMC_NR_BITS 4 +#define FIXED_PMC_CTRL(_idx, _val) ((_val) << ((_idx) * FIXED_PMC_NR_BITS)) + +#define PMU_CAP_FW_WRITES BIT_ULL(13) +#define PMU_CAP_LBR_FMT 0x3f + +#define INTEL_ARCH_CPU_CYCLES RAW_EVENT(0x3c, 0x00) +#define INTEL_ARCH_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) +#define INTEL_ARCH_REFERENCE_CYCLES RAW_EVENT(0x3c, 0x01) +#define INTEL_ARCH_LLC_REFERENCES RAW_EVENT(0x2e, 0x4f) +#define INTEL_ARCH_LLC_MISSES RAW_EVENT(0x2e, 0x41) +#define INTEL_ARCH_BRANCHES_RETIRED RAW_EVENT(0xc4, 0x00) +#define INTEL_ARCH_BRANCHES_MISPREDICTED RAW_EVENT(0xc5, 0x00) +#define INTEL_ARCH_TOPDOWN_SLOTS RAW_EVENT(0xa4, 0x01) + +#define AMD_ZEN_CORE_CYCLES RAW_EVENT(0x76, 0x00) +#define AMD_ZEN_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) +#define AMD_ZEN_BRANCHES_RETIRED RAW_EVENT(0xc2, 0x00) +#define AMD_ZEN_BRANCHES_MISPREDICTED RAW_EVENT(0xc3, 0x00) + +/* + * Note! The order and thus the index of the architectural events matters as + * support for each event is enumerated via CPUID using the index of the event. + */ +enum intel_pmu_architectural_events { + INTEL_ARCH_CPU_CYCLES_INDEX, + INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX, + INTEL_ARCH_REFERENCE_CYCLES_INDEX, + INTEL_ARCH_LLC_REFERENCES_INDEX, + INTEL_ARCH_LLC_MISSES_INDEX, + INTEL_ARCH_BRANCHES_RETIRED_INDEX, + INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX, + INTEL_ARCH_TOPDOWN_SLOTS_INDEX, + NR_INTEL_ARCH_EVENTS, +}; + +enum amd_pmu_zen_events { + AMD_ZEN_CORE_CYCLES_INDEX, + AMD_ZEN_INSTRUCTIONS_INDEX, + AMD_ZEN_BRANCHES_INDEX, + AMD_ZEN_BRANCH_MISSES_INDEX, + NR_AMD_ZEN_EVENTS, +}; + +extern const uint64_t intel_pmu_arch_events[]; +extern const uint64_t amd_pmu_zen_events[]; + +#endif /* SELFTEST_KVM_PMU_H */ diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h new file mode 100644 index 000000000000..9ec984cf8674 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -0,0 +1,1395 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include +#include +#include + +#include +#include + +#include +#include + +#include "kvm_util.h" +#include "ucall_common.h" + +extern bool host_cpu_is_intel; +extern bool host_cpu_is_amd; +extern uint64_t guest_tsc_khz; + +#ifndef MAX_NR_CPUID_ENTRIES +#define MAX_NR_CPUID_ENTRIES 100 +#endif + +/* Forced emulation prefix, used to invoke the emulator unconditionally. 
*/ +#define KVM_FEP "ud2; .byte 'k', 'v', 'm';" + +#define NMI_VECTOR 0x02 + +#define X86_EFLAGS_FIXED (1u << 1) + +#define X86_CR4_VME (1ul << 0) +#define X86_CR4_PVI (1ul << 1) +#define X86_CR4_TSD (1ul << 2) +#define X86_CR4_DE (1ul << 3) +#define X86_CR4_PSE (1ul << 4) +#define X86_CR4_PAE (1ul << 5) +#define X86_CR4_MCE (1ul << 6) +#define X86_CR4_PGE (1ul << 7) +#define X86_CR4_PCE (1ul << 8) +#define X86_CR4_OSFXSR (1ul << 9) +#define X86_CR4_OSXMMEXCPT (1ul << 10) +#define X86_CR4_UMIP (1ul << 11) +#define X86_CR4_LA57 (1ul << 12) +#define X86_CR4_VMXE (1ul << 13) +#define X86_CR4_SMXE (1ul << 14) +#define X86_CR4_FSGSBASE (1ul << 16) +#define X86_CR4_PCIDE (1ul << 17) +#define X86_CR4_OSXSAVE (1ul << 18) +#define X86_CR4_SMEP (1ul << 20) +#define X86_CR4_SMAP (1ul << 21) +#define X86_CR4_PKE (1ul << 22) + +struct xstate_header { + u64 xstate_bv; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +struct xstate { + u8 i387[512]; + struct xstate_header header; + u8 extended_state_area[0]; +} __attribute__ ((packed, aligned (64))); + +#define XFEATURE_MASK_FP BIT_ULL(0) +#define XFEATURE_MASK_SSE BIT_ULL(1) +#define XFEATURE_MASK_YMM BIT_ULL(2) +#define XFEATURE_MASK_BNDREGS BIT_ULL(3) +#define XFEATURE_MASK_BNDCSR BIT_ULL(4) +#define XFEATURE_MASK_OPMASK BIT_ULL(5) +#define XFEATURE_MASK_ZMM_Hi256 BIT_ULL(6) +#define XFEATURE_MASK_Hi16_ZMM BIT_ULL(7) +#define XFEATURE_MASK_PT BIT_ULL(8) +#define XFEATURE_MASK_PKRU BIT_ULL(9) +#define XFEATURE_MASK_PASID BIT_ULL(10) +#define XFEATURE_MASK_CET_USER BIT_ULL(11) +#define XFEATURE_MASK_CET_KERNEL BIT_ULL(12) +#define XFEATURE_MASK_LBR BIT_ULL(15) +#define XFEATURE_MASK_XTILE_CFG BIT_ULL(17) +#define XFEATURE_MASK_XTILE_DATA BIT_ULL(18) + +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA | \ + XFEATURE_MASK_XTILE_CFG) + +/* Note, these are ordered alphabetically to match kvm_cpuid_entry2. Eww. */ +enum cpuid_output_regs { + KVM_CPUID_EAX, + KVM_CPUID_EBX, + KVM_CPUID_ECX, + KVM_CPUID_EDX +}; + +/* + * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be + * passed by value with no overhead. + */ +struct kvm_x86_cpu_feature { + u32 function; + u16 index; + u8 reg; + u8 bit; +}; +#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ +({ \ + struct kvm_x86_cpu_feature feature = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .bit = __bit, \ + }; \ + \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ + feature; \ +}) + +/* + * Basic Leafs, a.k.a. 
Intel defined + */ +#define X86_FEATURE_MWAIT KVM_X86_CPU_FEATURE(0x1, 0, ECX, 3) +#define X86_FEATURE_VMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5) +#define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) +#define X86_FEATURE_PDCM KVM_X86_CPU_FEATURE(0x1, 0, ECX, 15) +#define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) +#define X86_FEATURE_X2APIC KVM_X86_CPU_FEATURE(0x1, 0, ECX, 21) +#define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) +#define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) +#define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) +#define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) +#define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) +#define X86_FEATURE_HYPERVISOR KVM_X86_CPU_FEATURE(0x1, 0, ECX, 31) +#define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6) +#define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) +#define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) +#define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19) +#define X86_FEATURE_XMM KVM_X86_CPU_FEATURE(0x1, 0, EDX, 25) +#define X86_FEATURE_XMM2 KVM_X86_CPU_FEATURE(0x1, 0, EDX, 26) +#define X86_FEATURE_FSGSBASE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 0) +#define X86_FEATURE_TSC_ADJUST KVM_X86_CPU_FEATURE(0x7, 0, EBX, 1) +#define X86_FEATURE_SGX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 2) +#define X86_FEATURE_HLE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 4) +#define X86_FEATURE_SMEP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 7) +#define X86_FEATURE_INVPCID KVM_X86_CPU_FEATURE(0x7, 0, EBX, 10) +#define X86_FEATURE_RTM KVM_X86_CPU_FEATURE(0x7, 0, EBX, 11) +#define X86_FEATURE_MPX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 14) +#define X86_FEATURE_SMAP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 20) +#define X86_FEATURE_PCOMMIT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 22) +#define X86_FEATURE_CLFLUSHOPT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 23) +#define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24) +#define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2) +#define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) +#define X86_FEATURE_OSPKE KVM_X86_CPU_FEATURE(0x7, 0, ECX, 4) +#define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) +#define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) +#define X86_FEATURE_SGX_LC KVM_X86_CPU_FEATURE(0x7, 0, ECX, 30) +#define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) +#define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) +#define X86_FEATURE_AMX_TILE KVM_X86_CPU_FEATURE(0x7, 0, EDX, 24) +#define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) +#define X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) +#define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) +#define X86_FEATURE_XTILECFG KVM_X86_CPU_FEATURE(0xD, 0, EAX, 17) +#define X86_FEATURE_XTILEDATA KVM_X86_CPU_FEATURE(0xD, 0, EAX, 18) +#define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) +#define X86_FEATURE_XFD KVM_X86_CPU_FEATURE(0xD, 1, EAX, 4) +#define X86_FEATURE_XTILEDATA_XFD KVM_X86_CPU_FEATURE(0xD, 18, ECX, 2) + +/* + * Extended Leafs, a.k.a. 
AMD defined + */ +#define X86_FEATURE_SVM KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 2) +#define X86_FEATURE_NX KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 20) +#define X86_FEATURE_GBPAGES KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26) +#define X86_FEATURE_RDTSCP KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27) +#define X86_FEATURE_LM KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 29) +#define X86_FEATURE_INVTSC KVM_X86_CPU_FEATURE(0x80000007, 0, EDX, 8) +#define X86_FEATURE_RDPRU KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 4) +#define X86_FEATURE_AMD_IBPB KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 12) +#define X86_FEATURE_NPT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 0) +#define X86_FEATURE_LBRV KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 1) +#define X86_FEATURE_NRIPS KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 3) +#define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) +#define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) +#define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) +#define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) +#define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) + +/* + * KVM defined paravirt features. + */ +#define X86_FEATURE_KVM_CLOCKSOURCE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 0) +#define X86_FEATURE_KVM_NOP_IO_DELAY KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 1) +#define X86_FEATURE_KVM_MMU_OP KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 2) +#define X86_FEATURE_KVM_CLOCKSOURCE2 KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 3) +#define X86_FEATURE_KVM_ASYNC_PF KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 4) +#define X86_FEATURE_KVM_STEAL_TIME KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 5) +#define X86_FEATURE_KVM_PV_EOI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 6) +#define X86_FEATURE_KVM_PV_UNHALT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 7) +/* Bit 8 apparently isn't used?!?! */ +#define X86_FEATURE_KVM_PV_TLB_FLUSH KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 9) +#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 10) +#define X86_FEATURE_KVM_PV_SEND_IPI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 11) +#define X86_FEATURE_KVM_POLL_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 12) +#define X86_FEATURE_KVM_PV_SCHED_YIELD KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 13) +#define X86_FEATURE_KVM_ASYNC_PF_INT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 14) +#define X86_FEATURE_KVM_MSI_EXT_DEST_ID KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 15) +#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) +#define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) + +/* + * Same idea as X86_FEATURE_XXX, but X86_PROPERTY_XXX retrieves a multi-bit + * value/property as opposed to a single-bit feature. Again, pack the info + * into a 64-bit value to pass by value with no overhead. 
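Each X86_FEATURE_* above is just a packed (function, index, register, bit) tuple, decoded by the query helpers defined later in this header (this_cpu_has() against raw CPUID, kvm_cpu_has() against KVM's supported CPUID). An illustrative guest-side fragment, shown before the multi-bit properties are introduced below; the ad hoc feature is purely for demonstration:

	/* X86_FEATURE_VMX is CPUID.01H:ECX[5]; one-off features can be built
	 * inline the same way without adding a #define to this header. */
	struct kvm_x86_cpu_feature vmx_inline = KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5);

	GUEST_ASSERT(this_cpu_has(vmx_inline) == this_cpu_has(X86_FEATURE_VMX));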
+ */ +struct kvm_x86_cpu_property { + u32 function; + u8 index; + u8 reg; + u8 lo_bit; + u8 hi_bit; +}; +#define KVM_X86_CPU_PROPERTY(fn, idx, gpr, low_bit, high_bit) \ +({ \ + struct kvm_x86_cpu_property property = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .lo_bit = low_bit, \ + .hi_bit = high_bit, \ + }; \ + \ + kvm_static_assert(low_bit < high_bit); \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ + property; \ +}) + +#define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) +#define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7) +#define X86_PROPERTY_PMU_NR_GP_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 8, 15) +#define X86_PROPERTY_PMU_GP_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 16, 23) +#define X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 24, 31) +#define X86_PROPERTY_PMU_EVENTS_MASK KVM_X86_CPU_PROPERTY(0xa, 0, EBX, 0, 7) +#define X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK KVM_X86_CPU_PROPERTY(0xa, 0, ECX, 0, 31) +#define X86_PROPERTY_PMU_NR_FIXED_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 0, 4) +#define X86_PROPERTY_PMU_FIXED_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 5, 12) + +#define X86_PROPERTY_SUPPORTED_XCR0_LO KVM_X86_CPU_PROPERTY(0xd, 0, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) +#define X86_PROPERTY_SUPPORTED_XCR0_HI KVM_X86_CPU_PROPERTY(0xd, 0, EDX, 0, 31) + +#define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_TILE_OFFSET KVM_X86_CPU_PROPERTY(0xd, 18, EBX, 0, 31) +#define X86_PROPERTY_AMX_MAX_PALETTE_TABLES KVM_X86_CPU_PROPERTY(0x1d, 0, EAX, 0, 31) +#define X86_PROPERTY_AMX_TOTAL_TILE_BYTES KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 0, 15) +#define X86_PROPERTY_AMX_BYTES_PER_TILE KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 16, 31) +#define X86_PROPERTY_AMX_BYTES_PER_ROW KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 0, 15) +#define X86_PROPERTY_AMX_NR_TILE_REGS KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 16, 31) +#define X86_PROPERTY_AMX_MAX_ROWS KVM_X86_CPU_PROPERTY(0x1d, 1, ECX, 0, 15) + +#define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31) + +#define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) +#define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) +#define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) +#define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) +#define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) +#define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) + +#define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) + +/* + * Intel's architectural PMU events are bizarre. They have a "feature" bit + * that indicates the feature is _not_ supported, and a property that states + * the length of the bit mask of unsupported features. A feature is supported + * if the size of the bit mask is larger than the "unavailable" bit, and said + * bit is not set. Fixed counters also bizarre enumeration, but inverted from + * arch events for general purpose counters. 
Fixed counters are supported if a + * feature flag is set **OR** the total number of fixed counters is greater + * than index of the counter. + * + * Wrap the events for general purpose and fixed counters to simplify checking + * whether or not a given architectural event is supported. + */ +struct kvm_x86_pmu_feature { + struct kvm_x86_cpu_feature f; +}; +#define KVM_X86_PMU_FEATURE(__reg, __bit) \ +({ \ + struct kvm_x86_pmu_feature feature = { \ + .f = KVM_X86_CPU_FEATURE(0xa, 0, __reg, __bit), \ + }; \ + \ + kvm_static_assert(KVM_CPUID_##__reg == KVM_CPUID_EBX || \ + KVM_CPUID_##__reg == KVM_CPUID_ECX); \ + feature; \ +}) + +#define X86_PMU_FEATURE_CPU_CYCLES KVM_X86_PMU_FEATURE(EBX, 0) +#define X86_PMU_FEATURE_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 1) +#define X86_PMU_FEATURE_REFERENCE_CYCLES KVM_X86_PMU_FEATURE(EBX, 2) +#define X86_PMU_FEATURE_LLC_REFERENCES KVM_X86_PMU_FEATURE(EBX, 3) +#define X86_PMU_FEATURE_LLC_MISSES KVM_X86_PMU_FEATURE(EBX, 4) +#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 5) +#define X86_PMU_FEATURE_BRANCHES_MISPREDICTED KVM_X86_PMU_FEATURE(EBX, 6) +#define X86_PMU_FEATURE_TOPDOWN_SLOTS KVM_X86_PMU_FEATURE(EBX, 7) + +#define X86_PMU_FEATURE_INSNS_RETIRED_FIXED KVM_X86_PMU_FEATURE(ECX, 0) +#define X86_PMU_FEATURE_CPU_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 1) +#define X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 2) +#define X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED KVM_X86_PMU_FEATURE(ECX, 3) + +static inline unsigned int x86_family(unsigned int eax) +{ + unsigned int x86; + + x86 = (eax >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +static inline unsigned int x86_model(unsigned int eax) +{ + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); +} + +/* Page table bitfield declarations */ +#define PTE_PRESENT_MASK BIT_ULL(0) +#define PTE_WRITABLE_MASK BIT_ULL(1) +#define PTE_USER_MASK BIT_ULL(2) +#define PTE_ACCESSED_MASK BIT_ULL(5) +#define PTE_DIRTY_MASK BIT_ULL(6) +#define PTE_LARGE_MASK BIT_ULL(7) +#define PTE_GLOBAL_MASK BIT_ULL(8) +#define PTE_NX_MASK BIT_ULL(63) + +#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1ULL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1) & PHYSICAL_PAGE_MASK) + +#define HUGEPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define HUGEPAGE_SIZE(x) (1UL << HUGEPAGE_SHIFT(x)) +#define HUGEPAGE_MASK(x) (~(HUGEPAGE_SIZE(x) - 1) & PHYSICAL_PAGE_MASK) + +#define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK) +#define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT) + +/* General Registers in 64-Bit Mode */ +struct gpr64_regs { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 rsp; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +struct desc64 { + uint16_t limit0; + uint16_t base0; + unsigned base1:8, type:4, s:1, dpl:2, p:1; + unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; + uint32_t base3; + uint32_t zero1; +} __attribute__((packed)); + +struct desc_ptr { + uint16_t size; + uint64_t address; +} __attribute__((packed)); + +struct kvm_x86_state { + struct kvm_xsave *xsave; + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + +static inline uint64_t get_desc64_base(const struct desc64 *desc) +{ + return ((uint64_t)desc->base3 << 
32) | + (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static inline uint64_t rdtsc(void) +{ + uint32_t eax, edx; + uint64_t tsc_val; + /* + * The lfence is to wait (on Intel CPUs) until all previous + * instructions have been executed. If software requires RDTSC to be + * executed prior to execution of any subsequent instruction, it can + * execute LFENCE immediately after RDTSC + */ + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx)); + tsc_val = ((uint64_t)edx) << 32 | eax; + return tsc_val; +} + +static inline uint64_t rdtscp(uint32_t *aux) +{ + uint32_t eax, edx; + + __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux)); + return ((uint64_t)edx) << 32 | eax; +} + +static inline uint64_t rdmsr(uint32_t msr) +{ + uint32_t a, d; + + __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +static inline void wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory"); +} + + +static inline uint16_t inw(uint16_t port) +{ + uint16_t tmp; + + __asm__ __volatile__("in %%dx, %%ax" + : /* output */ "=a" (tmp) + : /* input */ "d" (port)); + + return tmp; +} + +static inline uint16_t get_es(void) +{ + uint16_t es; + + __asm__ __volatile__("mov %%es, %[es]" + : /* output */ [es]"=rm"(es)); + return es; +} + +static inline uint16_t get_cs(void) +{ + uint16_t cs; + + __asm__ __volatile__("mov %%cs, %[cs]" + : /* output */ [cs]"=rm"(cs)); + return cs; +} + +static inline uint16_t get_ss(void) +{ + uint16_t ss; + + __asm__ __volatile__("mov %%ss, %[ss]" + : /* output */ [ss]"=rm"(ss)); + return ss; +} + +static inline uint16_t get_ds(void) +{ + uint16_t ds; + + __asm__ __volatile__("mov %%ds, %[ds]" + : /* output */ [ds]"=rm"(ds)); + return ds; +} + +static inline uint16_t get_fs(void) +{ + uint16_t fs; + + __asm__ __volatile__("mov %%fs, %[fs]" + : /* output */ [fs]"=rm"(fs)); + return fs; +} + +static inline uint16_t get_gs(void) +{ + uint16_t gs; + + __asm__ __volatile__("mov %%gs, %[gs]" + : /* output */ [gs]"=rm"(gs)); + return gs; +} + +static inline uint16_t get_tr(void) +{ + uint16_t tr; + + __asm__ __volatile__("str %[tr]" + : /* output */ [tr]"=rm"(tr)); + return tr; +} + +static inline uint64_t get_cr0(void) +{ + uint64_t cr0; + + __asm__ __volatile__("mov %%cr0, %[cr0]" + : /* output */ [cr0]"=r"(cr0)); + return cr0; +} + +static inline uint64_t get_cr3(void) +{ + uint64_t cr3; + + __asm__ __volatile__("mov %%cr3, %[cr3]" + : /* output */ [cr3]"=r"(cr3)); + return cr3; +} + +static inline uint64_t get_cr4(void) +{ + uint64_t cr4; + + __asm__ __volatile__("mov %%cr4, %[cr4]" + : /* output */ [cr4]"=r"(cr4)); + return cr4; +} + +static inline void set_cr4(uint64_t val) +{ + __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); +} + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + __asm__ __volatile__("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax | ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + __asm__ __volatile__("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +static inline void wrpkru(u32 pkru) +{ + /* Note, ECX and EDX are architecturally required to be '0'. 
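These register accessors are typically combined in guest code for low-level state setup. A small illustrative fragment (assumes guest CPUID advertises XSAVE; this_cpu_has() and the X86_CR4/XFEATURE macros are defined elsewhere in this header):

	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE));
	set_cr4(get_cr4() | X86_CR4_OSXSAVE);
	xsetbv(0, XFEATURE_MASK_FP | XFEATURE_MASK_SSE);
	GUEST_ASSERT(xgetbv(0) == (XFEATURE_MASK_FP | XFEATURE_MASK_SSE));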
*/ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(0), "d"(0)); +} + +static inline struct desc_ptr get_gdt(void) +{ + struct desc_ptr gdt; + __asm__ __volatile__("sgdt %[gdt]" + : /* output */ [gdt]"=m"(gdt)); + return gdt; +} + +static inline struct desc_ptr get_idt(void) +{ + struct desc_ptr idt; + __asm__ __volatile__("sidt %[idt]" + : /* output */ [idt]"=m"(idt)); + return idt; +} + +static inline void outl(uint16_t port, uint32_t value) +{ + __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); +} + +static inline void __cpuid(uint32_t function, uint32_t index, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + *eax = function; + *ecx = index; + + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(uint32_t function, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + return __cpuid(function, 0, eax, ebx, ecx, edx); +} + +static inline uint32_t this_cpu_fms(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline uint32_t this_cpu_family(void) +{ + return x86_family(this_cpu_fms()); +} + +static inline uint32_t this_cpu_model(void) +{ + return x86_model(this_cpu_fms()); +} + +static inline bool this_cpu_vendor_string_is(const char *vendor) +{ + const uint32_t *chunk = (const uint32_t *)vendor; + uint32_t eax, ebx, ecx, edx; + + cpuid(0, &eax, &ebx, &ecx, &edx); + return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); +} + +static inline bool this_cpu_is_intel(void) +{ + return this_cpu_vendor_string_is("GenuineIntel"); +} + +/* + * Exclude early K5 samples with a vendor string of "AMDisbetter!" + */ +static inline bool this_cpu_is_amd(void) +{ + return this_cpu_vendor_string_is("AuthenticAMD"); +} + +static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) +{ + uint32_t gprs[4]; + + __cpuid(function, index, + &gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX], + &gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]); + + return (gprs[reg] & GENMASK(hi, lo)) >> lo; +} + +static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return __this_cpu_has(feature.function, feature.index, + feature.reg, feature.bit, feature.bit); +} + +static inline uint32_t this_cpu_property(struct kvm_x86_cpu_property property) +{ + return __this_cpu_has(property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + +static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} + +static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) +{ + uint32_t nr_bits; + + if (feature.f.reg == KVM_CPUID_EBX) { + nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + return nr_bits > feature.f.bit && !this_cpu_has(feature.f); + } + + GUEST_ASSERT(feature.f.reg == KVM_CPUID_ECX); + nr_bits = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + return nr_bits > feature.f.bit || this_cpu_has(feature.f); +} + +static 
__always_inline uint64_t this_cpu_supported_xcr0(void) +{ + if (!this_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) + return 0; + + return this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | + ((uint64_t)this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); +} + +typedef u32 __attribute__((vector_size(16))) sse128_t; +#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } +#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) +#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) + +static inline void read_sse_reg(int reg, sse128_t *data) +{ + switch (reg) { + case 0: + asm("movdqa %%xmm0, %0" : "=m"(*data)); + break; + case 1: + asm("movdqa %%xmm1, %0" : "=m"(*data)); + break; + case 2: + asm("movdqa %%xmm2, %0" : "=m"(*data)); + break; + case 3: + asm("movdqa %%xmm3, %0" : "=m"(*data)); + break; + case 4: + asm("movdqa %%xmm4, %0" : "=m"(*data)); + break; + case 5: + asm("movdqa %%xmm5, %0" : "=m"(*data)); + break; + case 6: + asm("movdqa %%xmm6, %0" : "=m"(*data)); + break; + case 7: + asm("movdqa %%xmm7, %0" : "=m"(*data)); + break; + default: + BUG(); + } +} + +static inline void write_sse_reg(int reg, const sse128_t *data) +{ + switch (reg) { + case 0: + asm("movdqa %0, %%xmm0" : : "m"(*data)); + break; + case 1: + asm("movdqa %0, %%xmm1" : : "m"(*data)); + break; + case 2: + asm("movdqa %0, %%xmm2" : : "m"(*data)); + break; + case 3: + asm("movdqa %0, %%xmm3" : : "m"(*data)); + break; + case 4: + asm("movdqa %0, %%xmm4" : : "m"(*data)); + break; + case 5: + asm("movdqa %0, %%xmm5" : : "m"(*data)); + break; + case 6: + asm("movdqa %0, %%xmm6" : : "m"(*data)); + break; + case 7: + asm("movdqa %0, %%xmm7" : : "m"(*data)); + break; + default: + BUG(); + } +} + +static inline void cpu_relax(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void udelay(unsigned long usec) +{ + uint64_t start, now, cycles; + + GUEST_ASSERT(guest_tsc_khz); + cycles = guest_tsc_khz / 1000 * usec; + + /* + * Deliberately don't PAUSE, a.k.a. cpu_relax(), so that the delay is + * as accurate as possible, e.g. doesn't trigger PAUSE-Loop VM-Exits. + */ + start = rdtsc(); + do { + now = rdtsc(); + } while (now - start < cycles); +} + +#define ud2() \ + __asm__ __volatile__( \ + "ud2\n" \ + ) + +#define hlt() \ + __asm__ __volatile__( \ + "hlt\n" \ + ) + +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu); +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state); +void kvm_x86_state_cleanup(struct kvm_x86_state *state); + +const struct kvm_msr_list *kvm_get_msr_index_list(void); +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); +uint64_t kvm_get_feature_msr(uint64_t msr_index); + +static inline void vcpu_msrs_get(struct kvm_vcpu *vcpu, + struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vcpu, KVM_GET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); +} +static inline void vcpu_msrs_set(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vcpu, KVM_SET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_SET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ? 
-1 : msrs->entries[r].index); +} +static inline void vcpu_debugregs_get(struct kvm_vcpu *vcpu, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vcpu, KVM_GET_DEBUGREGS, debugregs); +} +static inline void vcpu_debugregs_set(struct kvm_vcpu *vcpu, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vcpu, KVM_SET_DEBUGREGS, debugregs); +} +static inline void vcpu_xsave_get(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_GET_XSAVE, xsave); +} +static inline void vcpu_xsave2_get(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_GET_XSAVE2, xsave); +} +static inline void vcpu_xsave_set(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_SET_XSAVE, xsave); +} +static inline void vcpu_xcrs_get(struct kvm_vcpu *vcpu, + struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vcpu, KVM_GET_XCRS, xcrs); +} +static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); +} + +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); + +static inline uint32_t kvm_cpu_fms(void) +{ + return get_cpuid_entry(kvm_get_supported_cpuid(), 0x1, 0)->eax; +} + +static inline uint32_t kvm_cpu_family(void) +{ + return x86_family(kvm_cpu_fms()); +} + +static inline uint32_t kvm_cpu_model(void) +{ + return x86_model(kvm_cpu_fms()); +} + +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature); + +static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); +} + +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property); + +static inline uint32_t kvm_cpu_property(struct kvm_x86_cpu_property property) +{ + return kvm_cpuid_property(kvm_get_supported_cpuid(), property); +} + +static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} + +static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) +{ + uint32_t nr_bits; + + if (feature.f.reg == KVM_CPUID_EBX) { + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + return nr_bits > feature.f.bit && !kvm_cpu_has(feature.f); + } + + TEST_ASSERT_EQ(feature.f.reg, KVM_CPUID_ECX); + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + return nr_bits > feature.f.bit || kvm_cpu_has(feature.f); +} + +static __always_inline uint64_t kvm_cpu_supported_xcr0(void) +{ + if (!kvm_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) + return 0; + + return kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | + ((uint64_t)kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); +} + +static inline size_t kvm_cpuid2_size(int nr_entries) +{ + return sizeof(struct kvm_cpuid2) + + sizeof(struct kvm_cpuid_entry2) * nr_entries; +} + +/* + * Allocate a "struct kvm_cpuid2* instance, with the 0-length arrary of + * entries sized to hold @nr_entries. The caller is responsible for freeing + * the struct. 
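The kvm_cpu_*() and kvm_pmu_has() variants above answer "what can KVM give a guest" from the host side and are the usual gating mechanism at the top of a test. An illustrative fragment:

	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
	TEST_REQUIRE(kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED));

	if (kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION))
		pr_info("KVM reports PMU version %u\n",
			kvm_cpu_property(X86_PROPERTY_PMU_VERSION));

	pr_info("KVM-supported XCR0: 0x%lx\n", kvm_cpu_supported_xcr0());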
+ */ +static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) +{ + struct kvm_cpuid2 *cpuid; + + cpuid = malloc(kvm_cpuid2_size(nr_entries)); + TEST_ASSERT(cpuid, "-ENOMEM when allocating kvm_cpuid2"); + + cpuid->nent = nr_entries; + + return cpuid; +} + +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); + +static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function, + uint32_t index) +{ + return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, + function, index); +} + +static inline struct kvm_cpuid_entry2 *vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function) +{ + return __vcpu_get_cpuid_entry(vcpu, function, 0); +} + +static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) +{ + int r; + + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + r = __vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + if (r) + return r; + + /* On success, refresh the cache to pick up adjustments made by KVM. */ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); + return 0; +} + +static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) +{ + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + + /* Refresh the cache to pick up adjustments made by KVM. */ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); +} + +static inline void vcpu_get_cpuid(struct kvm_vcpu *vcpu) +{ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); +} + +void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_property property, + uint32_t value); +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); + +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); + +static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + return *((&entry->eax) + feature.reg) & BIT(feature.bit); +} + +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set); + +static inline void vcpu_set_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, true); + +} + +static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); +} + +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value); + +/* + * Assert on an MSR access(es) and pretty print the MSR name when possible. + * Note, the caller provides the stringified name so that the name of macro is + * printed, not the value the macro resolves to (due to macro expansion). + */ +#define TEST_ASSERT_MSR(cond, fmt, msr, str, args...) \ +do { \ + if (__builtin_constant_p(msr)) { \ + TEST_ASSERT(cond, fmt, str, args); \ + } else if (!(cond)) { \ + char buf[16]; \ + \ + snprintf(buf, sizeof(buf), "MSR 0x%x", msr); \ + TEST_ASSERT(cond, fmt, buf, args); \ + } \ +} while (0) + +/* + * Returns true if KVM should return the last written value when reading an MSR + * from userspace, e.g. the MSR isn't a command MSR, doesn't emulate state that + * is changing, etc. This is NOT an exhaustive list! The intent is to filter + * out MSRs that are not durable _and_ that a selftest wants to write. 
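As a usage illustration (not part of this patch), the write-then-verify behavior motivated above plays out in the vcpu_set_msr() macro that follows: a durable MSR is written, read back, and compared, while an inherently volatile MSR such as IA32_TSC is only written. The vCPU and the MSR/value choices here are placeholders:

#include "kvm_util.h"
#include "processor.h"

static void set_some_msrs(struct kvm_vcpu *vcpu)
{
	/* Durable MSR: the macro writes it, reads it back, and asserts equality. */
	vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, 0);

	/* IA32_TSC is always moving, so the read-back check is skipped. */
	vcpu_set_msr(vcpu, MSR_IA32_TSC, 0);
}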
+ */ +static inline bool is_durable_msr(uint32_t msr) +{ + return msr != MSR_IA32_TSC; +} + +#define vcpu_set_msr(vcpu, msr, val) \ +do { \ + uint64_t r, v = val; \ + \ + TEST_ASSERT_MSR(_vcpu_set_msr(vcpu, msr, v) == 1, \ + "KVM_SET_MSRS failed on %s, value = 0x%lx", msr, #msr, v); \ + if (!is_durable_msr(msr)) \ + break; \ + r = vcpu_get_msr(vcpu, msr); \ + TEST_ASSERT_MSR(r == v, "Set %s to '0x%lx', got back '0x%lx'", msr, #msr, v, r);\ +} while (0) + +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +void kvm_init_vm_address_properties(struct kvm_vm *vm); +bool vm_is_unrestricted_guest(struct kvm_vm *vm); + +struct ex_regs { + uint64_t rax, rcx, rdx, rbx; + uint64_t rbp, rsi, rdi; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; + uint64_t vector; + uint64_t error_code; + uint64_t rip; + uint64_t cs; + uint64_t rflags; +}; + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)); + +/* If a toddler were to say "abracadabra". */ +#define KVM_EXCEPTION_MAGIC 0xabacadabaULL + +/* + * KVM selftest exception fixup uses registers to coordinate with the exception + * handler, versus the kernel's in-memory tables and KVM-Unit-Tests's in-memory + * per-CPU data. Using only registers avoids having to map memory into the + * guest, doesn't require a valid, stable GS.base, and reduces the risk of + * for recursive faults when accessing memory in the handler. The downside to + * using registers is that it restricts what registers can be used by the actual + * instruction. But, selftests are 64-bit only, making register* pressure a + * minor concern. Use r9-r11 as they are volatile, i.e. don't need to be saved + * by the callee, and except for r11 are not implicit parameters to any + * instructions. Ideally, fixup would use r8-r10 and thus avoid implicit + * parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V + * is higher priority than testing non-faulting SYSCALL/SYSRET. + * + * Note, the fixup handler deliberately does not handle #DE, i.e. the vector + * is guaranteed to be non-zero on fault. + * + * REGISTER INPUTS: + * r9 = MAGIC + * r10 = RIP + * r11 = new RIP on fault + * + * REGISTER OUTPUTS: + * r9 = exception vector (non-zero) + * r10 = error code + */ +#define __KVM_ASM_SAFE(insn, fep) \ + "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ + "lea 1f(%%rip), %%r10\n\t" \ + "lea 2f(%%rip), %%r11\n\t" \ + fep "1: " insn "\n\t" \ + "xor %%r9, %%r9\n\t" \ + "2:\n\t" \ + "mov %%r9b, %[vector]\n\t" \ + "mov %%r10, %[error_code]\n\t" + +#define KVM_ASM_SAFE(insn) __KVM_ASM_SAFE(insn, "") +#define KVM_ASM_SAFE_FEP(insn) __KVM_ASM_SAFE(insn, KVM_FEP) + +#define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec) +#define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" + +#define kvm_asm_safe(insn, inputs...) \ +({ \ + uint64_t ign_error_code; \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_ec(insn, error_code, inputs...) 
\ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_fep(insn, inputs...) \ +({ \ + uint64_t ign_error_code; \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_ec_fep(insn, error_code, inputs...) \ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE_FEP(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ +static inline uint8_t insn##_safe ##_fep(uint32_t idx, uint64_t *val) \ +{ \ + uint64_t error_code; \ + uint8_t vector; \ + uint32_t a, d; \ + \ + asm volatile(KVM_ASM_SAFE##_FEP(#insn) \ + : "=a"(a), "=d"(d), \ + KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : "c"(idx) \ + : KVM_ASM_SAFE_CLOBBERS); \ + \ + *val = (uint64_t)a | ((uint64_t)d << 32); \ + return vector; \ +} + +/* + * Generate {insn}_safe() and {insn}_safe_fep() helpers for instructions that + * use ECX as in input index, and EDX:EAX as a 64-bit output. + */ +#define BUILD_READ_U64_SAFE_HELPERS(insn) \ + BUILD_READ_U64_SAFE_HELPER(insn, , ) \ + BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ + +BUILD_READ_U64_SAFE_HELPERS(rdmsr) +BUILD_READ_U64_SAFE_HELPERS(rdpmc) +BUILD_READ_U64_SAFE_HELPERS(xgetbv) + +static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) +{ + return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); +} + +static inline uint8_t xsetbv_safe(uint32_t index, uint64_t value) +{ + u32 eax = value; + u32 edx = value >> 32; + + return kvm_asm_safe("xsetbv", "a" (eax), "d" (edx), "c" (index)); +} + +bool kvm_is_tdp_enabled(void); + +static inline bool kvm_is_pmu_enabled(void) +{ + return get_kvm_param_bool("enable_pmu"); +} + +static inline bool kvm_is_forced_emulation_enabled(void) +{ + return !!get_kvm_param_integer("force_emulation_prefix"); +} + +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level); +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3); +uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1); +void xen_hypercall(uint64_t nr, uint64_t a0, void *a1); + +static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa, + uint64_t size, uint64_t flags) +{ + return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0); +} + +static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, + uint64_t flags) +{ + uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags); + + GUEST_ASSERT(!ret); +} + +void __vm_xsave_require_permission(uint64_t xfeature, const char *name); + +#define vm_xsave_require_permission(xfeature) \ + __vm_xsave_require_permission(xfeature, #xfeature) + +enum pg_level { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_512G, + PG_LEVEL_NUM +}; + +#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) +#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level)) + +#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K) +#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) +#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t 
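A guest-side sketch of how the *_safe() probes above might be used: because the register-based fixup hands the vector back to the caller instead of letting the fault kill the guest, a test can assert that touching a nonexistent MSR raises #GP. This is illustrative only, not an existing test; the MSR index is deliberately bogus, the assertion macros come from the ucall framework, and the expectation assumes the host is not configured to ignore unknown MSRs:

#include "processor.h"
#include "ucall_common.h"

static void guest_probe_bad_msr(void)
{
	uint64_t val;
	uint8_t vector;

	/* RDMSR of an unimplemented MSR should be reflected back as #GP. */
	vector = rdmsr_safe(0xdeadc0de, &val);
	GUEST_ASSERT_EQ(vector, GP_VECTOR);

	/* Likewise for WRMSR. */
	vector = wrmsr_safe(0xdeadc0de, 0);
	GUEST_ASSERT_EQ(vector, GP_VECTOR);

	GUEST_DONE();
}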
paddr, + uint64_t nr_bytes, int level); + +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE (1UL<<0) /* Protection Enable */ +#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */ +#define X86_CR0_EM (1UL<<2) /* Emulation */ +#define X86_CR0_TS (1UL<<3) /* Task Switched */ +#define X86_CR0_ET (1UL<<4) /* Extension Type */ +#define X86_CR0_NE (1UL<<5) /* Numeric Error */ +#define X86_CR0_WP (1UL<<16) /* Write Protect */ +#define X86_CR0_AM (1UL<<18) /* Alignment Mask */ +#define X86_CR0_NW (1UL<<29) /* Not Write-through */ +#define X86_CR0_CD (1UL<<30) /* Cache Disable */ +#define X86_CR0_PG (1UL<<31) /* Paging */ + +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 +#define PFERR_IMPLICIT_ACCESS_BIT 48 + +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) + +bool sys_clocksource_is_based_on_tsc(void); + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/x86/sev.h b/tools/testing/selftests/kvm/include/x86/sev.h new file mode 100644 index 000000000000..82c11c81a956 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/sev.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers used for SEV guests + * + */ +#ifndef SELFTEST_KVM_SEV_H +#define SELFTEST_KVM_SEV_H + +#include +#include + +#include "linux/psp-sev.h" + +#include "kvm_util.h" +#include "svm_util.h" +#include "processor.h" + +enum sev_guest_state { + SEV_GUEST_STATE_UNINITIALIZED = 0, + SEV_GUEST_STATE_LAUNCH_UPDATE, + SEV_GUEST_STATE_LAUNCH_SECRET, + SEV_GUEST_STATE_RUNNING, +}; + +#define SEV_POLICY_NO_DBG (1UL << 0) +#define SEV_POLICY_ES (1UL << 2) + +#define GHCB_MSR_TERM_REQ 0x100 + +void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); +void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); +void sev_vm_launch_finish(struct kvm_vm *vm); + +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, + struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); + +kvm_static_assert(SEV_RET_SUCCESS == 0); + +/* + * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long" + * instead of a proper struct. The size of the parameter is embedded in the + * ioctl number, i.e. is ABI and thus immutable. Hack around the mess by + * creating an overlay to pass in an "unsigned long" without a cast (casting + * will make the compiler unhappy due to dereferencing an aliased pointer). 
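A stand-alone sketch of the overlay idiom described here: a union lets the same storage be initialized as a typed struct yet handed to an interface that insists on an "unsigned long", with no pointer cast to trip aliasing warnings. The struct and the consumer below are invented for illustration; only the idiom matches __vm_sev_ioctl():

#include <stdint.h>

struct cmd_like {
	uint32_t id;
	uint32_t error;
	uint64_t data;
};

/* Stand-in for an ABI whose argument is defined as a pointer to unsigned long. */
static int takes_ulong_ptr(unsigned long *arg)
{
	return *arg ? 0 : -1;
}

int main(void)
{
	union {
		struct cmd_like c;
		unsigned long raw;
	} cmd = { .c = { .id = 1, .data = 0x1000 } };

	/* Pass &cmd.raw; no cast of &cmd.c is needed, the union does the aliasing. */
	return takes_ulong_ptr(&cmd.raw);
}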
+ */ +#define __vm_sev_ioctl(vm, cmd, arg) \ +({ \ + int r; \ + \ + union { \ + struct kvm_sev_cmd c; \ + unsigned long raw; \ + } sev_cmd = { .c = { \ + .id = (cmd), \ + .data = (uint64_t)(arg), \ + .sev_fd = (vm)->arch.sev_fd, \ + } }; \ + \ + r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &sev_cmd.raw); \ + r ?: sev_cmd.c.error; \ +}) + +#define vm_sev_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_sev_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); + +static inline void sev_register_encrypted_memory(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + struct kvm_enc_region range = { + .addr = region->region.userspace_addr, + .size = region->region.memory_size, + }; + + vm_ioctl(vm, KVM_MEMORY_ENCRYPT_REG_REGION, &range); +} + +static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, + uint64_t size) +{ + struct kvm_sev_launch_update_data update_data = { + .uaddr = (unsigned long)addr_gpa2hva(vm, gpa), + .len = size, + }; + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data); +} + +#endif /* SELFTEST_KVM_SEV_H */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h new file mode 100644 index 000000000000..29cffd0a9181 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -0,0 +1,320 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef SELFTEST_KVM_SVM_H +#define SELFTEST_KVM_SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, + INTERCEPT_RDPRU, +}; + +struct hv_vmcb_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) + +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + +struct __attribute__ ((__packed__)) vmcb_control_area { + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[40]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; 
+ u64 nested_ctl; + u64 avic_vapic_bar; + u8 reserved_4[8]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. + */ + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_GIF_SHIFT 9 +#define V_GIF_MASK (1 << V_GIF_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define V_GIF_ENABLE_SHIFT 25 +#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) + +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << 
SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_READ 0 +#define INTERCEPT_CR3_READ 3 +#define INTERCEPT_CR4_READ 4 +#define INTERCEPT_CR8_READ 8 +#define INTERCEPT_CR0_WRITE (16 + 0) +#define INTERCEPT_CR3_WRITE (16 + 3) +#define INTERCEPT_CR4_WRITE (16 + 4) +#define INTERCEPT_CR8_WRITE (16 + 8) + +#define INTERCEPT_DR0_READ 0 +#define INTERCEPT_DR1_READ 1 +#define INTERCEPT_DR2_READ 2 +#define INTERCEPT_DR3_READ 3 +#define INTERCEPT_DR4_READ 4 +#define INTERCEPT_DR5_READ 5 +#define INTERCEPT_DR6_READ 6 +#define INTERCEPT_DR7_READ 7 +#define INTERCEPT_DR0_WRITE (16 + 0) +#define INTERCEPT_DR1_WRITE (16 + 1) +#define INTERCEPT_DR2_WRITE (16 + 2) +#define INTERCEPT_DR3_WRITE (16 + 3) +#define INTERCEPT_DR4_WRITE (16 + 4) +#define INTERCEPT_DR5_WRITE (16 + 5) +#define INTERCEPT_DR6_WRITE (16 + 6) +#define INTERCEPT_DR7_WRITE (16 + 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 + +#define SVM_EXITINFO_REG_MASK 0x0F + +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) + +#endif /* SELFTEST_KVM_SVM_H */ diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h new file mode 100644 index 000000000000..b74c6dcddcbd --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/svm_util.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020, Red Hat, Inc. + */ + +#ifndef SELFTEST_KVM_SVM_UTILS_H +#define SELFTEST_KVM_SVM_UTILS_H + +#include + +#include +#include "svm.h" +#include "processor.h" + +struct svm_test_data { + /* VMCB */ + struct vmcb *vmcb; /* gva */ + void *vmcb_hva; + uint64_t vmcb_gpa; + + /* host state-save area */ + struct vmcb_save_area *save_area; /* gva */ + void *save_area_hva; + uint64_t save_area_gpa; + + /* MSR-Bitmap */ + void *msr; /* gva */ + void *msr_hva; + uint64_t msr_gpa; +}; + +static inline void vmmcall(void) +{ + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. 
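The SVM_EVTINJ_* values a bit further up compose the VMCB EVENTINJ field; a small sketch (not from this patch) of how an L1 guest in a nested test might use them to queue an exception for L2 before VMRUN. The vmcb would come from vcpu_alloc_svm()/generic_svm_setup(), and the choice of #GP with a zero error code is arbitrary:

#include "svm_util.h"

/* Queue a #GP with error code 0 for delivery to L2 on the next VMRUN. */
static void l1_inject_gp(struct vmcb *vmcb)
{
	vmcb->control.event_inj = GP_VECTOR | SVM_EVTINJ_TYPE_EXEPT |
				  SVM_EVTINJ_VALID | SVM_EVTINJ_VALID_ERR;
	vmcb->control.event_inj_err = 0;
}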
+ */ + __asm__ __volatile__("push %%rbp; vmmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +#define stgi() \ + __asm__ __volatile__( \ + "stgi\n" \ + ) + +#define clgi() \ + __asm__ __volatile__( \ + "clgi\n" \ + ) + +struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); + +int open_sev_dev_path_or_exit(void); + +#endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86/ucall.h b/tools/testing/selftests/kvm/include/x86/ucall.h new file mode 100644 index 000000000000..d3825dcc3cd9 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/ucall.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_IO + +static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ +} + +#endif diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h new file mode 100644 index 000000000000..edb3c391b982 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_VMX_H +#define SELFTEST_KVM_VMX_H + +#include + +#include +#include "processor.h" +#include "apic.h" + +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP 0x08000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. 
+ */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 +#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 +#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 +#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EPT_VE 0x00040000 +#define SECONDARY_ENABLE_XSAV_RESTORE 0x00100000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 + +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 +#define PIN_BASED_POSTED_INTR 0x00000080 + +#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 + +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 +#define VM_EXIT_SAVE_IA32_EFER 0x00100000 +#define VM_EXIT_LOAD_IA32_EFER 0x00200000 +#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 + +#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff + +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 +#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 + +#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff + +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f +#define VMX_MISC_SAVE_EFER_LMA 0x00000020 + +#define VMX_EPT_VPID_CAP_1G_PAGES 0x00020000 +#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 + +#define EXIT_REASON_FAILED_VMENTRY 0x80000000 + +enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, + TSC_OFFSET = 0x00002010, + 
TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + EOI_EXIT_BITMAP0 = 0x0000201c, + EOI_EXIT_BITMAP0_HIGH = 0x0000201d, + EOI_EXIT_BITMAP1 = 0x0000201e, + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, + EOI_EXIT_BITMAP2 = 0x00002020, + EOI_EXIT_BITMAP2_HIGH = 0x00002021, + EOI_EXIT_BITMAP3 = 0x00002022, + EOI_EXIT_BITMAP3_HIGH = 0x00002023, + VMREAD_BITMAP = 0x00002026, + VMREAD_BITMAP_HIGH = 0x00002027, + VMWRITE_BITMAP = 0x00002028, + VMWRITE_BITMAP_HIGH = 0x00002029, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_EFER = 0x00002806, + GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_EFER = 0x00002c02, + HOST_IA32_EFER_HIGH = 0x00002c03, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 
0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +struct vmx_msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t value; +} __attribute__ ((aligned(16))); + +#include "evmcs.h" + +static inline int vmxon(uint64_t phys) +{ + uint8_t ret; + + __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(phys) + : "cc", "memory"); + + return ret; +} + +static inline void vmxoff(void) +{ + __asm__ __volatile__("vmxoff"); +} + +static inline int vmclear(uint64_t vmcs_pa) +{ + uint8_t ret; + + __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(vmcs_pa) + : "cc", "memory"); + + return ret; +} + +static inline int vmptrld(uint64_t vmcs_pa) +{ + uint8_t ret; + + if (enable_evmcs) + return -1; + + __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(vmcs_pa) + : "cc", "memory"); + + return ret; +} + +static inline int vmptrst(uint64_t *value) +{ + uint64_t tmp; + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmptrst(value); + + __asm__ __volatile__("vmptrst %[value]; setna %[ret]" + : [value]"=m"(tmp), [ret]"=rm"(ret) + : : "cc", "memory"); + + *value = tmp; + return ret; +} + +/* + * A wrapper around vmptrst that ignores errors and returns zero if the + * vmptrst instruction fails. + */ +static inline uint64_t vmptrstz(void) +{ + uint64_t value = 0; + vmptrst(&value); + return value; +} + +/* + * No guest state (e.g. GPRs) is established by this vmlaunch. + */ +static inline int vmlaunch(void) +{ + int ret; + + if (enable_evmcs) + return evmcs_vmlaunch(); + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "vmwrite %%rsp, %[host_rsp];" + "lea 1f(%%rip), %%rax;" + "vmwrite %%rax, %[host_rip];" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r"((uint64_t)HOST_RSP), + [host_rip]"r"((uint64_t)HOST_RIP) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. 
GPRs) is established by this vmresume. + */ +static inline int vmresume(void) +{ + int ret; + + if (enable_evmcs) + return evmcs_vmresume(); + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "vmwrite %%rsp, %[host_rsp];" + "lea 1f(%%rip), %%rax;" + "vmwrite %%rax, %[host_rip];" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r"((uint64_t)HOST_RSP), + [host_rip]"r"((uint64_t)HOST_RIP) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +static inline void vmcall(void) +{ + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. + */ + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +static inline int vmread(uint64_t encoding, uint64_t *value) +{ + uint64_t tmp; + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmread(encoding, value); + + __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]" + : [value]"=rm"(tmp), [ret]"=rm"(ret) + : [encoding]"r"(encoding) + : "cc", "memory"); + + *value = tmp; + return ret; +} + +/* + * A wrapper around vmread that ignores errors and returns zero if the + * vmread instruction fails. + */ +static inline uint64_t vmreadz(uint64_t encoding) +{ + uint64_t value = 0; + vmread(encoding, &value); + return value; +} + +static inline int vmwrite(uint64_t encoding, uint64_t value) +{ + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmwrite(encoding, value); + + __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]" + : [ret]"=rm"(ret) + : [value]"rm"(value), [encoding]"r"(encoding) + : "cc", "memory"); + + return ret; +} + +static inline uint32_t vmcs_revision(void) +{ + return rdmsr(MSR_IA32_VMX_BASIC); +} + +struct vmx_pages { + void *vmxon_hva; + uint64_t vmxon_gpa; + void *vmxon; + + void *vmcs_hva; + uint64_t vmcs_gpa; + void *vmcs; + + void *msr_hva; + uint64_t msr_gpa; + void *msr; + + void *shadow_vmcs_hva; + uint64_t shadow_vmcs_gpa; + void *shadow_vmcs; + + void *vmread_hva; + uint64_t vmread_gpa; + void *vmread; + + void *vmwrite_hva; + uint64_t vmwrite_gpa; + void *vmwrite; + + void *eptp_hva; + uint64_t eptp_gpa; + void *eptp; + + void *apic_access_hva; + uint64_t apic_access_gpa; + void *apic_access; +}; + +union vmx_basic { + u64 val; + struct { + u32 revision; + u32 size:13, + reserved1:3, + width:1, + dual:1, + type:4, + insouts:1, + ctrl:1, + vm_entry_exception_ctrl:1, + reserved2:7; + }; +}; + +union vmx_ctrl_msr { + u64 val; + struct { + u32 set, clr; + }; +}; + +struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); +bool prepare_for_vmx_operation(struct vmx_pages *vmx); +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); +bool load_vmcs(struct vmx_pages *vmx); + +bool ept_1g_pages_supported(void); + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr); +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + 
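Putting the helpers above together, an L1 guest typically drives a nested run by entering VMX operation, loading and populating a VMCS, launching, and then inspecting the exit with vmreadz(). The outline below is a hedged sketch rather than an actual test: the L2 body and stack sizing are placeholders, EXIT_REASON_VMCALL (architecturally basic exit reason 18) is assumed to be visible via the included headers, and error handling is reduced to GUEST_ASSERT():

#include "vmx.h"
#include "ucall_common.h"

static void l2_guest_code(void)
{
	vmcall();	/* bounce straight back to L1 */
}

/* Runs in L1; @vmx is the vmx_pages allocated by vcpu_alloc_vmx() in the host. */
static void l1_guest_code(struct vmx_pages *vmx)
{
	unsigned long l2_stack[128];

	GUEST_ASSERT(prepare_for_vmx_operation(vmx));
	GUEST_ASSERT(load_vmcs(vmx));

	prepare_vmcs(vmx, l2_guest_code, &l2_stack[128]);

	GUEST_ASSERT(!vmlaunch());
	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
	GUEST_DONE();
}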
uint32_t memslot); +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); +bool kvm_cpu_has_ept(void); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); + +#endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h deleted file mode 100644 index 51990094effd..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/apic.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/x86_64/apic.h - * - * Copyright (C) 2021, Google LLC. - */ - -#ifndef SELFTEST_KVM_APIC_H -#define SELFTEST_KVM_APIC_H - -#include - -#include "processor.h" -#include "ucall_common.h" - -#define APIC_DEFAULT_GPA 0xfee00000ULL - -/* APIC base address MSR and fields */ -#define MSR_IA32_APICBASE 0x0000001b -#define MSR_IA32_APICBASE_BSP (1<<8) -#define MSR_IA32_APICBASE_EXTD (1<<10) -#define MSR_IA32_APICBASE_ENABLE (1<<11) -#define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define GET_APIC_BASE(x) (((x) >> 12) << 12) - -#define APIC_BASE_MSR 0x800 -#define X2APIC_ENABLE (1UL << 10) -#define APIC_ID 0x20 -#define APIC_LVR 0x30 -#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) -#define APIC_TASKPRI 0x80 -#define APIC_PROCPRI 0xA0 -#define APIC_EOI 0xB0 -#define APIC_SPIV 0xF0 -#define APIC_SPIV_FOCUS_DISABLED (1 << 9) -#define APIC_SPIV_APIC_ENABLED (1 << 8) -#define APIC_IRR 0x200 -#define APIC_ICR 0x300 -#define APIC_LVTCMCI 0x2f0 -#define APIC_DEST_SELF 0x40000 -#define APIC_DEST_ALLINC 0x80000 -#define APIC_DEST_ALLBUT 0xC0000 -#define APIC_ICR_RR_MASK 0x30000 -#define APIC_ICR_RR_INVALID 0x00000 -#define APIC_ICR_RR_INPROG 0x10000 -#define APIC_ICR_RR_VALID 0x20000 -#define APIC_INT_LEVELTRIG 0x08000 -#define APIC_INT_ASSERT 0x04000 -#define APIC_ICR_BUSY 0x01000 -#define APIC_DEST_LOGICAL 0x00800 -#define APIC_DEST_PHYSICAL 0x00000 -#define APIC_DM_FIXED 0x00000 -#define APIC_DM_FIXED_MASK 0x00700 -#define APIC_DM_LOWEST 0x00100 -#define APIC_DM_SMI 0x00200 -#define APIC_DM_REMRD 0x00300 -#define APIC_DM_NMI 0x00400 -#define APIC_DM_INIT 0x00500 -#define APIC_DM_STARTUP 0x00600 -#define APIC_DM_EXTINT 0x00700 -#define APIC_VECTOR_MASK 0x000FF -#define APIC_ICR2 0x310 -#define SET_APIC_DEST_FIELD(x) ((x) << 24) -#define APIC_LVTT 0x320 -#define APIC_LVT_TIMER_ONESHOT (0 << 17) -#define APIC_LVT_TIMER_PERIODIC (1 << 17) -#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) -#define APIC_LVT_MASKED (1 << 16) -#define APIC_TMICT 0x380 -#define APIC_TMCCT 0x390 -#define APIC_TDCR 0x3E0 - -void apic_disable(void); -void xapic_enable(void); -void x2apic_enable(void); - -static inline uint32_t get_bsp_flag(void) -{ - return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; -} - -static inline uint32_t xapic_read_reg(unsigned int reg) -{ - return ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2]; -} - -static inline void xapic_write_reg(unsigned int reg, uint32_t val) -{ - ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2] = val; -} - -static inline uint64_t x2apic_read_reg(unsigned int reg) -{ - return rdmsr(APIC_BASE_MSR + (reg >> 4)); -} - -static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) -{ - return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); -} - -static inline void x2apic_write_reg(unsigned int reg, uint64_t value) -{ - uint8_t fault = x2apic_write_reg_safe(reg, value); - - 
__GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", - fault, APIC_BASE_MSR + (reg >> 4), value); -} - -static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) -{ - uint8_t fault = x2apic_write_reg_safe(reg, value); - - __GUEST_ASSERT(fault == GP_VECTOR, - "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", - APIC_BASE_MSR + (reg >> 4), value, fault); -} - - -#endif /* SELFTEST_KVM_APIC_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h deleted file mode 100644 index 901caf0e0939..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ /dev/null @@ -1,1279 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * tools/testing/selftests/kvm/include/x86_64/evmcs.h - * - * Copyright (C) 2018, Red Hat, Inc. - * - */ - -#ifndef SELFTEST_KVM_EVMCS_H -#define SELFTEST_KVM_EVMCS_H - -#include -#include "hyperv.h" -#include "vmx.h" - -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t - -#define EVMCS_VERSION 1 - -extern bool enable_evmcs; - -struct hv_enlightened_vmcs { - u32 revision_id; - u32 abort; - - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - - u16 padding16_1; - - u64 host_ia32_pat; - u64 host_ia32_efer; - - u64 host_cr0; - u64 host_cr3; - u64 host_cr4; - - u64 host_ia32_sysenter_esp; - u64 host_ia32_sysenter_eip; - u64 host_rip; - u32 host_ia32_sysenter_cs; - - u32 pin_based_vm_exec_control; - u32 vm_exit_controls; - u32 secondary_vm_exec_control; - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - - u64 guest_es_base; - u64 guest_cs_base; - u64 guest_ss_base; - u64 guest_ds_base; - u64 guest_fs_base; - u64 guest_gs_base; - u64 guest_ldtr_base; - u64 guest_tr_base; - u64 guest_gdtr_base; - u64 guest_idtr_base; - - u64 padding64_1[3]; - - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - - u64 cr3_target_value0; - u64 cr3_target_value1; - u64 cr3_target_value2; - u64 cr3_target_value3; - - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - - u32 cr3_target_count; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_msr_load_count; - - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 vmcs_link_pointer; - - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - - u64 guest_pending_dbg_exceptions; - u64 guest_sysenter_esp; - u64 guest_sysenter_eip; - - u32 guest_activity_state; - u32 guest_sysenter_cs; - - u64 cr0_guest_host_mask; - u64 cr4_guest_host_mask; - u64 cr0_read_shadow; - u64 cr4_read_shadow; - u64 guest_cr0; - u64 guest_cr3; - u64 guest_cr4; - u64 guest_dr7; - - u64 host_fs_base; - u64 host_gs_base; - u64 host_tr_base; - 
u64 host_gdtr_base; - u64 host_idtr_base; - u64 host_rsp; - - u64 ept_pointer; - - u16 virtual_processor_id; - u16 padding16_2[3]; - - u64 padding64_2[5]; - u64 guest_physical_address; - - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - - u64 exit_qualification; - u64 exit_io_instruction_ecx; - u64 exit_io_instruction_esi; - u64 exit_io_instruction_edi; - u64 exit_io_instruction_eip; - - u64 guest_linear_address; - u64 guest_rsp; - u64 guest_rflags; - - u32 guest_interruptibility_info; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 vm_entry_controls; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - - u64 guest_rip; - - u32 hv_clean_fields; - u32 padding32_1; - u32 hv_synthetic_controls; - struct { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 reserved:30; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u32 padding32_2; - u64 hv_vm_id; - u64 partition_assist_page; - u64 padding64_4[4]; - u64 guest_bndcfgs; - u64 guest_ia32_perf_global_ctrl; - u64 guest_ia32_s_cet; - u64 guest_ssp; - u64 guest_ia32_int_ssp_table_addr; - u64 guest_ia32_lbr_ctl; - u64 padding64_5[2]; - u64 xss_exit_bitmap; - u64 encls_exiting_bitmap; - u64 host_ia32_perf_global_ctrl; - u64 tsc_multiplier; - u64 host_ia32_s_cet; - u64 host_ssp; - u64 host_ia32_int_ssp_table_addr; - u64 padding64_6; -} __packed; - -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF - -#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 - -extern struct hv_enlightened_vmcs *current_evmcs; - -int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); - -static inline void evmcs_enable(void) -{ - enable_evmcs = true; -} - -static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) -{ - current_vp_assist->current_nested_vmcs = vmcs_pa; - current_vp_assist->enlighten_vmentry = 1; - - current_evmcs = vmcs; - - return 0; -} - -static inline bool load_evmcs(struct hyperv_test_pages *hv) -{ - if (evmcs_vmptrld(hv->enlightened_vmcs_gpa, hv->enlightened_vmcs)) - return false; - - current_evmcs->revision_id = EVMCS_VERSION; - - return true; -} - -static inline int evmcs_vmptrst(uint64_t *value) -{ - *value = current_vp_assist->current_nested_vmcs & - ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; - - return 0; -} - -static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) -{ - switch 
(encoding) { - case GUEST_RIP: - *value = current_evmcs->guest_rip; - break; - case GUEST_RSP: - *value = current_evmcs->guest_rsp; - break; - case GUEST_RFLAGS: - *value = current_evmcs->guest_rflags; - break; - case HOST_IA32_PAT: - *value = current_evmcs->host_ia32_pat; - break; - case HOST_IA32_EFER: - *value = current_evmcs->host_ia32_efer; - break; - case HOST_CR0: - *value = current_evmcs->host_cr0; - break; - case HOST_CR3: - *value = current_evmcs->host_cr3; - break; - case HOST_CR4: - *value = current_evmcs->host_cr4; - break; - case HOST_IA32_SYSENTER_ESP: - *value = current_evmcs->host_ia32_sysenter_esp; - break; - case HOST_IA32_SYSENTER_EIP: - *value = current_evmcs->host_ia32_sysenter_eip; - break; - case HOST_RIP: - *value = current_evmcs->host_rip; - break; - case IO_BITMAP_A: - *value = current_evmcs->io_bitmap_a; - break; - case IO_BITMAP_B: - *value = current_evmcs->io_bitmap_b; - break; - case MSR_BITMAP: - *value = current_evmcs->msr_bitmap; - break; - case GUEST_ES_BASE: - *value = current_evmcs->guest_es_base; - break; - case GUEST_CS_BASE: - *value = current_evmcs->guest_cs_base; - break; - case GUEST_SS_BASE: - *value = current_evmcs->guest_ss_base; - break; - case GUEST_DS_BASE: - *value = current_evmcs->guest_ds_base; - break; - case GUEST_FS_BASE: - *value = current_evmcs->guest_fs_base; - break; - case GUEST_GS_BASE: - *value = current_evmcs->guest_gs_base; - break; - case GUEST_LDTR_BASE: - *value = current_evmcs->guest_ldtr_base; - break; - case GUEST_TR_BASE: - *value = current_evmcs->guest_tr_base; - break; - case GUEST_GDTR_BASE: - *value = current_evmcs->guest_gdtr_base; - break; - case GUEST_IDTR_BASE: - *value = current_evmcs->guest_idtr_base; - break; - case TSC_OFFSET: - *value = current_evmcs->tsc_offset; - break; - case VIRTUAL_APIC_PAGE_ADDR: - *value = current_evmcs->virtual_apic_page_addr; - break; - case VMCS_LINK_POINTER: - *value = current_evmcs->vmcs_link_pointer; - break; - case GUEST_IA32_DEBUGCTL: - *value = current_evmcs->guest_ia32_debugctl; - break; - case GUEST_IA32_PAT: - *value = current_evmcs->guest_ia32_pat; - break; - case GUEST_IA32_EFER: - *value = current_evmcs->guest_ia32_efer; - break; - case GUEST_PDPTR0: - *value = current_evmcs->guest_pdptr0; - break; - case GUEST_PDPTR1: - *value = current_evmcs->guest_pdptr1; - break; - case GUEST_PDPTR2: - *value = current_evmcs->guest_pdptr2; - break; - case GUEST_PDPTR3: - *value = current_evmcs->guest_pdptr3; - break; - case GUEST_PENDING_DBG_EXCEPTIONS: - *value = current_evmcs->guest_pending_dbg_exceptions; - break; - case GUEST_SYSENTER_ESP: - *value = current_evmcs->guest_sysenter_esp; - break; - case GUEST_SYSENTER_EIP: - *value = current_evmcs->guest_sysenter_eip; - break; - case CR0_GUEST_HOST_MASK: - *value = current_evmcs->cr0_guest_host_mask; - break; - case CR4_GUEST_HOST_MASK: - *value = current_evmcs->cr4_guest_host_mask; - break; - case CR0_READ_SHADOW: - *value = current_evmcs->cr0_read_shadow; - break; - case CR4_READ_SHADOW: - *value = current_evmcs->cr4_read_shadow; - break; - case GUEST_CR0: - *value = current_evmcs->guest_cr0; - break; - case GUEST_CR3: - *value = current_evmcs->guest_cr3; - break; - case GUEST_CR4: - *value = current_evmcs->guest_cr4; - break; - case GUEST_DR7: - *value = current_evmcs->guest_dr7; - break; - case HOST_FS_BASE: - *value = current_evmcs->host_fs_base; - break; - case HOST_GS_BASE: - *value = current_evmcs->host_gs_base; - break; - case HOST_TR_BASE: - *value = current_evmcs->host_tr_base; - break; - case HOST_GDTR_BASE: - *value = 
current_evmcs->host_gdtr_base; - break; - case HOST_IDTR_BASE: - *value = current_evmcs->host_idtr_base; - break; - case HOST_RSP: - *value = current_evmcs->host_rsp; - break; - case EPT_POINTER: - *value = current_evmcs->ept_pointer; - break; - case GUEST_BNDCFGS: - *value = current_evmcs->guest_bndcfgs; - break; - case XSS_EXIT_BITMAP: - *value = current_evmcs->xss_exit_bitmap; - break; - case GUEST_PHYSICAL_ADDRESS: - *value = current_evmcs->guest_physical_address; - break; - case EXIT_QUALIFICATION: - *value = current_evmcs->exit_qualification; - break; - case GUEST_LINEAR_ADDRESS: - *value = current_evmcs->guest_linear_address; - break; - case VM_EXIT_MSR_STORE_ADDR: - *value = current_evmcs->vm_exit_msr_store_addr; - break; - case VM_EXIT_MSR_LOAD_ADDR: - *value = current_evmcs->vm_exit_msr_load_addr; - break; - case VM_ENTRY_MSR_LOAD_ADDR: - *value = current_evmcs->vm_entry_msr_load_addr; - break; - case CR3_TARGET_VALUE0: - *value = current_evmcs->cr3_target_value0; - break; - case CR3_TARGET_VALUE1: - *value = current_evmcs->cr3_target_value1; - break; - case CR3_TARGET_VALUE2: - *value = current_evmcs->cr3_target_value2; - break; - case CR3_TARGET_VALUE3: - *value = current_evmcs->cr3_target_value3; - break; - case TPR_THRESHOLD: - *value = current_evmcs->tpr_threshold; - break; - case GUEST_INTERRUPTIBILITY_INFO: - *value = current_evmcs->guest_interruptibility_info; - break; - case CPU_BASED_VM_EXEC_CONTROL: - *value = current_evmcs->cpu_based_vm_exec_control; - break; - case EXCEPTION_BITMAP: - *value = current_evmcs->exception_bitmap; - break; - case VM_ENTRY_CONTROLS: - *value = current_evmcs->vm_entry_controls; - break; - case VM_ENTRY_INTR_INFO_FIELD: - *value = current_evmcs->vm_entry_intr_info_field; - break; - case VM_ENTRY_EXCEPTION_ERROR_CODE: - *value = current_evmcs->vm_entry_exception_error_code; - break; - case VM_ENTRY_INSTRUCTION_LEN: - *value = current_evmcs->vm_entry_instruction_len; - break; - case HOST_IA32_SYSENTER_CS: - *value = current_evmcs->host_ia32_sysenter_cs; - break; - case PIN_BASED_VM_EXEC_CONTROL: - *value = current_evmcs->pin_based_vm_exec_control; - break; - case VM_EXIT_CONTROLS: - *value = current_evmcs->vm_exit_controls; - break; - case SECONDARY_VM_EXEC_CONTROL: - *value = current_evmcs->secondary_vm_exec_control; - break; - case GUEST_ES_LIMIT: - *value = current_evmcs->guest_es_limit; - break; - case GUEST_CS_LIMIT: - *value = current_evmcs->guest_cs_limit; - break; - case GUEST_SS_LIMIT: - *value = current_evmcs->guest_ss_limit; - break; - case GUEST_DS_LIMIT: - *value = current_evmcs->guest_ds_limit; - break; - case GUEST_FS_LIMIT: - *value = current_evmcs->guest_fs_limit; - break; - case GUEST_GS_LIMIT: - *value = current_evmcs->guest_gs_limit; - break; - case GUEST_LDTR_LIMIT: - *value = current_evmcs->guest_ldtr_limit; - break; - case GUEST_TR_LIMIT: - *value = current_evmcs->guest_tr_limit; - break; - case GUEST_GDTR_LIMIT: - *value = current_evmcs->guest_gdtr_limit; - break; - case GUEST_IDTR_LIMIT: - *value = current_evmcs->guest_idtr_limit; - break; - case GUEST_ES_AR_BYTES: - *value = current_evmcs->guest_es_ar_bytes; - break; - case GUEST_CS_AR_BYTES: - *value = current_evmcs->guest_cs_ar_bytes; - break; - case GUEST_SS_AR_BYTES: - *value = current_evmcs->guest_ss_ar_bytes; - break; - case GUEST_DS_AR_BYTES: - *value = current_evmcs->guest_ds_ar_bytes; - break; - case GUEST_FS_AR_BYTES: - *value = current_evmcs->guest_fs_ar_bytes; - break; - case GUEST_GS_AR_BYTES: - *value = current_evmcs->guest_gs_ar_bytes; - break; - case 
GUEST_LDTR_AR_BYTES: - *value = current_evmcs->guest_ldtr_ar_bytes; - break; - case GUEST_TR_AR_BYTES: - *value = current_evmcs->guest_tr_ar_bytes; - break; - case GUEST_ACTIVITY_STATE: - *value = current_evmcs->guest_activity_state; - break; - case GUEST_SYSENTER_CS: - *value = current_evmcs->guest_sysenter_cs; - break; - case VM_INSTRUCTION_ERROR: - *value = current_evmcs->vm_instruction_error; - break; - case VM_EXIT_REASON: - *value = current_evmcs->vm_exit_reason; - break; - case VM_EXIT_INTR_INFO: - *value = current_evmcs->vm_exit_intr_info; - break; - case VM_EXIT_INTR_ERROR_CODE: - *value = current_evmcs->vm_exit_intr_error_code; - break; - case IDT_VECTORING_INFO_FIELD: - *value = current_evmcs->idt_vectoring_info_field; - break; - case IDT_VECTORING_ERROR_CODE: - *value = current_evmcs->idt_vectoring_error_code; - break; - case VM_EXIT_INSTRUCTION_LEN: - *value = current_evmcs->vm_exit_instruction_len; - break; - case VMX_INSTRUCTION_INFO: - *value = current_evmcs->vmx_instruction_info; - break; - case PAGE_FAULT_ERROR_CODE_MASK: - *value = current_evmcs->page_fault_error_code_mask; - break; - case PAGE_FAULT_ERROR_CODE_MATCH: - *value = current_evmcs->page_fault_error_code_match; - break; - case CR3_TARGET_COUNT: - *value = current_evmcs->cr3_target_count; - break; - case VM_EXIT_MSR_STORE_COUNT: - *value = current_evmcs->vm_exit_msr_store_count; - break; - case VM_EXIT_MSR_LOAD_COUNT: - *value = current_evmcs->vm_exit_msr_load_count; - break; - case VM_ENTRY_MSR_LOAD_COUNT: - *value = current_evmcs->vm_entry_msr_load_count; - break; - case HOST_ES_SELECTOR: - *value = current_evmcs->host_es_selector; - break; - case HOST_CS_SELECTOR: - *value = current_evmcs->host_cs_selector; - break; - case HOST_SS_SELECTOR: - *value = current_evmcs->host_ss_selector; - break; - case HOST_DS_SELECTOR: - *value = current_evmcs->host_ds_selector; - break; - case HOST_FS_SELECTOR: - *value = current_evmcs->host_fs_selector; - break; - case HOST_GS_SELECTOR: - *value = current_evmcs->host_gs_selector; - break; - case HOST_TR_SELECTOR: - *value = current_evmcs->host_tr_selector; - break; - case GUEST_ES_SELECTOR: - *value = current_evmcs->guest_es_selector; - break; - case GUEST_CS_SELECTOR: - *value = current_evmcs->guest_cs_selector; - break; - case GUEST_SS_SELECTOR: - *value = current_evmcs->guest_ss_selector; - break; - case GUEST_DS_SELECTOR: - *value = current_evmcs->guest_ds_selector; - break; - case GUEST_FS_SELECTOR: - *value = current_evmcs->guest_fs_selector; - break; - case GUEST_GS_SELECTOR: - *value = current_evmcs->guest_gs_selector; - break; - case GUEST_LDTR_SELECTOR: - *value = current_evmcs->guest_ldtr_selector; - break; - case GUEST_TR_SELECTOR: - *value = current_evmcs->guest_tr_selector; - break; - case VIRTUAL_PROCESSOR_ID: - *value = current_evmcs->virtual_processor_id; - break; - case HOST_IA32_PERF_GLOBAL_CTRL: - *value = current_evmcs->host_ia32_perf_global_ctrl; - break; - case GUEST_IA32_PERF_GLOBAL_CTRL: - *value = current_evmcs->guest_ia32_perf_global_ctrl; - break; - case ENCLS_EXITING_BITMAP: - *value = current_evmcs->encls_exiting_bitmap; - break; - case TSC_MULTIPLIER: - *value = current_evmcs->tsc_multiplier; - break; - default: return 1; - } - - return 0; -} - -static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) -{ - switch (encoding) { - case GUEST_RIP: - current_evmcs->guest_rip = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case GUEST_RSP: - current_evmcs->guest_rsp = value; - 
current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; - break; - case GUEST_RFLAGS: - current_evmcs->guest_rflags = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; - break; - case HOST_IA32_PAT: - current_evmcs->host_ia32_pat = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_IA32_EFER: - current_evmcs->host_ia32_efer = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_CR0: - current_evmcs->host_cr0 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_CR3: - current_evmcs->host_cr3 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_CR4: - current_evmcs->host_cr4 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_IA32_SYSENTER_ESP: - current_evmcs->host_ia32_sysenter_esp = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_IA32_SYSENTER_EIP: - current_evmcs->host_ia32_sysenter_eip = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_RIP: - current_evmcs->host_rip = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case IO_BITMAP_A: - current_evmcs->io_bitmap_a = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; - break; - case IO_BITMAP_B: - current_evmcs->io_bitmap_b = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; - break; - case MSR_BITMAP: - current_evmcs->msr_bitmap = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; - break; - case GUEST_ES_BASE: - current_evmcs->guest_es_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_CS_BASE: - current_evmcs->guest_cs_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_SS_BASE: - current_evmcs->guest_ss_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_DS_BASE: - current_evmcs->guest_ds_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_FS_BASE: - current_evmcs->guest_fs_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GS_BASE: - current_evmcs->guest_gs_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_LDTR_BASE: - current_evmcs->guest_ldtr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_TR_BASE: - current_evmcs->guest_tr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GDTR_BASE: - current_evmcs->guest_gdtr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_IDTR_BASE: - current_evmcs->guest_idtr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case TSC_OFFSET: - current_evmcs->tsc_offset = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; - break; - case 
VIRTUAL_APIC_PAGE_ADDR: - current_evmcs->virtual_apic_page_addr = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; - break; - case VMCS_LINK_POINTER: - current_evmcs->vmcs_link_pointer = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_IA32_DEBUGCTL: - current_evmcs->guest_ia32_debugctl = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_IA32_PAT: - current_evmcs->guest_ia32_pat = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_IA32_EFER: - current_evmcs->guest_ia32_efer = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_PDPTR0: - current_evmcs->guest_pdptr0 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_PDPTR1: - current_evmcs->guest_pdptr1 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_PDPTR2: - current_evmcs->guest_pdptr2 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_PDPTR3: - current_evmcs->guest_pdptr3 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_PENDING_DBG_EXCEPTIONS: - current_evmcs->guest_pending_dbg_exceptions = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_SYSENTER_ESP: - current_evmcs->guest_sysenter_esp = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_SYSENTER_EIP: - current_evmcs->guest_sysenter_eip = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case CR0_GUEST_HOST_MASK: - current_evmcs->cr0_guest_host_mask = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case CR4_GUEST_HOST_MASK: - current_evmcs->cr4_guest_host_mask = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case CR0_READ_SHADOW: - current_evmcs->cr0_read_shadow = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case CR4_READ_SHADOW: - current_evmcs->cr4_read_shadow = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case GUEST_CR0: - current_evmcs->guest_cr0 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case GUEST_CR3: - current_evmcs->guest_cr3 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case GUEST_CR4: - current_evmcs->guest_cr4 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case GUEST_DR7: - current_evmcs->guest_dr7 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; - break; - case HOST_FS_BASE: - current_evmcs->host_fs_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case HOST_GS_BASE: - current_evmcs->host_gs_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case HOST_TR_BASE: - current_evmcs->host_tr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case HOST_GDTR_BASE: - 
current_evmcs->host_gdtr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case HOST_IDTR_BASE: - current_evmcs->host_idtr_base = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case HOST_RSP: - current_evmcs->host_rsp = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - break; - case EPT_POINTER: - current_evmcs->ept_pointer = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; - break; - case GUEST_BNDCFGS: - current_evmcs->guest_bndcfgs = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case XSS_EXIT_BITMAP: - current_evmcs->xss_exit_bitmap = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; - break; - case GUEST_PHYSICAL_ADDRESS: - current_evmcs->guest_physical_address = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case EXIT_QUALIFICATION: - current_evmcs->exit_qualification = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case GUEST_LINEAR_ADDRESS: - current_evmcs->guest_linear_address = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VM_EXIT_MSR_STORE_ADDR: - current_evmcs->vm_exit_msr_store_addr = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case VM_EXIT_MSR_LOAD_ADDR: - current_evmcs->vm_exit_msr_load_addr = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case VM_ENTRY_MSR_LOAD_ADDR: - current_evmcs->vm_entry_msr_load_addr = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case CR3_TARGET_VALUE0: - current_evmcs->cr3_target_value0 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case CR3_TARGET_VALUE1: - current_evmcs->cr3_target_value1 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case CR3_TARGET_VALUE2: - current_evmcs->cr3_target_value2 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case CR3_TARGET_VALUE3: - current_evmcs->cr3_target_value3 = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case TPR_THRESHOLD: - current_evmcs->tpr_threshold = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case GUEST_INTERRUPTIBILITY_INFO: - current_evmcs->guest_interruptibility_info = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; - break; - case CPU_BASED_VM_EXEC_CONTROL: - current_evmcs->cpu_based_vm_exec_control = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC; - break; - case EXCEPTION_BITMAP: - current_evmcs->exception_bitmap = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN; - break; - case VM_ENTRY_CONTROLS: - current_evmcs->vm_entry_controls = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY; - break; - case VM_ENTRY_INTR_INFO_FIELD: - current_evmcs->vm_entry_intr_info_field = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; - break; - case VM_ENTRY_EXCEPTION_ERROR_CODE: - current_evmcs->vm_entry_exception_error_code = value; - 
current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; - break; - case VM_ENTRY_INSTRUCTION_LEN: - current_evmcs->vm_entry_instruction_len = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; - break; - case HOST_IA32_SYSENTER_CS: - current_evmcs->host_ia32_sysenter_cs = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case PIN_BASED_VM_EXEC_CONTROL: - current_evmcs->pin_based_vm_exec_control = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; - break; - case VM_EXIT_CONTROLS: - current_evmcs->vm_exit_controls = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; - break; - case SECONDARY_VM_EXEC_CONTROL: - current_evmcs->secondary_vm_exec_control = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; - break; - case GUEST_ES_LIMIT: - current_evmcs->guest_es_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_CS_LIMIT: - current_evmcs->guest_cs_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_SS_LIMIT: - current_evmcs->guest_ss_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_DS_LIMIT: - current_evmcs->guest_ds_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_FS_LIMIT: - current_evmcs->guest_fs_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GS_LIMIT: - current_evmcs->guest_gs_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_LDTR_LIMIT: - current_evmcs->guest_ldtr_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_TR_LIMIT: - current_evmcs->guest_tr_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GDTR_LIMIT: - current_evmcs->guest_gdtr_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_IDTR_LIMIT: - current_evmcs->guest_idtr_limit = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_ES_AR_BYTES: - current_evmcs->guest_es_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_CS_AR_BYTES: - current_evmcs->guest_cs_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_SS_AR_BYTES: - current_evmcs->guest_ss_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_DS_AR_BYTES: - current_evmcs->guest_ds_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_FS_AR_BYTES: - current_evmcs->guest_fs_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GS_AR_BYTES: - current_evmcs->guest_gs_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_LDTR_AR_BYTES: - current_evmcs->guest_ldtr_ar_bytes = value; - current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_TR_AR_BYTES: - current_evmcs->guest_tr_ar_bytes = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_ACTIVITY_STATE: - current_evmcs->guest_activity_state = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case GUEST_SYSENTER_CS: - current_evmcs->guest_sysenter_cs = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case VM_INSTRUCTION_ERROR: - current_evmcs->vm_instruction_error = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VM_EXIT_REASON: - current_evmcs->vm_exit_reason = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VM_EXIT_INTR_INFO: - current_evmcs->vm_exit_intr_info = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VM_EXIT_INTR_ERROR_CODE: - current_evmcs->vm_exit_intr_error_code = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case IDT_VECTORING_INFO_FIELD: - current_evmcs->idt_vectoring_info_field = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case IDT_VECTORING_ERROR_CODE: - current_evmcs->idt_vectoring_error_code = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VM_EXIT_INSTRUCTION_LEN: - current_evmcs->vm_exit_instruction_len = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case VMX_INSTRUCTION_INFO: - current_evmcs->vmx_instruction_info = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; - break; - case PAGE_FAULT_ERROR_CODE_MASK: - current_evmcs->page_fault_error_code_mask = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case PAGE_FAULT_ERROR_CODE_MATCH: - current_evmcs->page_fault_error_code_match = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case CR3_TARGET_COUNT: - current_evmcs->cr3_target_count = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case VM_EXIT_MSR_STORE_COUNT: - current_evmcs->vm_exit_msr_store_count = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case VM_EXIT_MSR_LOAD_COUNT: - current_evmcs->vm_exit_msr_load_count = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case VM_ENTRY_MSR_LOAD_COUNT: - current_evmcs->vm_entry_msr_load_count = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - break; - case HOST_ES_SELECTOR: - current_evmcs->host_es_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_CS_SELECTOR: - current_evmcs->host_cs_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_SS_SELECTOR: - current_evmcs->host_ss_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_DS_SELECTOR: - current_evmcs->host_ds_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_FS_SELECTOR: - current_evmcs->host_fs_selector = value; - current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_GS_SELECTOR: - current_evmcs->host_gs_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case HOST_TR_SELECTOR: - current_evmcs->host_tr_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case GUEST_ES_SELECTOR: - current_evmcs->guest_es_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_CS_SELECTOR: - current_evmcs->guest_cs_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_SS_SELECTOR: - current_evmcs->guest_ss_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_DS_SELECTOR: - current_evmcs->guest_ds_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_FS_SELECTOR: - current_evmcs->guest_fs_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_GS_SELECTOR: - current_evmcs->guest_gs_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_LDTR_SELECTOR: - current_evmcs->guest_ldtr_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case GUEST_TR_SELECTOR: - current_evmcs->guest_tr_selector = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; - break; - case VIRTUAL_PROCESSOR_ID: - current_evmcs->virtual_processor_id = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; - break; - case HOST_IA32_PERF_GLOBAL_CTRL: - current_evmcs->host_ia32_perf_global_ctrl = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - break; - case GUEST_IA32_PERF_GLOBAL_CTRL: - current_evmcs->guest_ia32_perf_global_ctrl = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; - break; - case ENCLS_EXITING_BITMAP: - current_evmcs->encls_exiting_bitmap = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; - break; - case TSC_MULTIPLIER: - current_evmcs->tsc_multiplier = value; - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; - break; - default: return 1; - } - - return 0; -} - -static inline int evmcs_vmlaunch(void) -{ - int ret; - - current_evmcs->hv_clean_fields = 0; - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "mov %%rsp, (%[host_rsp]);" - "lea 1f(%%rip), %%rax;" - "mov %%rax, (%[host_rip]);" - "vmlaunch;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r" - ((uint64_t)¤t_evmcs->host_rsp), - [host_rip]"r" - ((uint64_t)¤t_evmcs->host_rip) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -/* - * No guest state (e.g. GPRs) is established by this vmresume. 
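
Every evmcs_vmwrite() case above follows the same two-step pattern: store the new value into the enlightened VMCS in memory, then clear the clean-field bit covering that field group so the hypervisor reloads it on the next entry (evmcs_vmlaunch() simply zeroes hv_clean_fields to force a full reload). A minimal sketch of that pattern, using a cut-down struct and made-up group masks rather than the real eVMCS layout:

#include <stdint.h>

/* Illustrative only: reduced stand-ins for the real enlightened-VMCS layout
 * and the HV_VMX_ENLIGHTENED_CLEAN_FIELD_* masks used above. */
#define MINI_CLEAN_FIELD_GUEST_BASIC	(1u << 2)
#define MINI_CLEAN_FIELD_HOST_GRP1	(1u << 8)

struct mini_evmcs {
	uint64_t guest_rsp;
	uint64_t host_rip;
	uint32_t hv_clean_fields;	/* set bit == hypervisor may reuse its cached copy */
};

static inline void mini_write_guest_rsp(struct mini_evmcs *evmcs, uint64_t val)
{
	evmcs->guest_rsp = val;
	/* Mark the guest-basic group dirty so the next entry reloads it. */
	evmcs->hv_clean_fields &= ~MINI_CLEAN_FIELD_GUEST_BASIC;
}

static inline void mini_write_host_rip(struct mini_evmcs *evmcs, uint64_t val)
{
	evmcs->host_rip = val;
	evmcs->hv_clean_fields &= ~MINI_CLEAN_FIELD_HOST_GRP1;
}
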
- */ -static inline int evmcs_vmresume(void) -{ - int ret; - - /* HOST_RIP */ - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; - /* HOST_RSP */ - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "mov %%rsp, (%[host_rsp]);" - "lea 1f(%%rip), %%rax;" - "mov %%rax, (%[host_rip]);" - "vmresume;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r" - ((uint64_t)¤t_evmcs->host_rsp), - [host_rip]"r" - ((uint64_t)¤t_evmcs->host_rip) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h deleted file mode 100644 index 6849e2552f1b..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ /dev/null @@ -1,364 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * tools/testing/selftests/kvm/include/x86_64/hyperv.h - * - * Copyright (C) 2021, Red Hat, Inc. - * - */ - -#ifndef SELFTEST_KVM_HYPERV_H -#define SELFTEST_KVM_HYPERV_H - -#include "processor.h" - -#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 -#define HYPERV_CPUID_INTERFACE 0x40000001 -#define HYPERV_CPUID_VERSION 0x40000002 -#define HYPERV_CPUID_FEATURES 0x40000003 -#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 -#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 -#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 -#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A -#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 -#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 -#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 - -#define HV_X64_MSR_GUEST_OS_ID 0x40000000 -#define HV_X64_MSR_HYPERCALL 0x40000001 -#define HV_X64_MSR_VP_INDEX 0x40000002 -#define HV_X64_MSR_RESET 0x40000003 -#define HV_X64_MSR_VP_RUNTIME 0x40000010 -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 -#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 -#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 -#define HV_X64_MSR_EOI 0x40000070 -#define HV_X64_MSR_ICR 0x40000071 -#define HV_X64_MSR_TPR 0x40000072 -#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 -#define HV_X64_MSR_SCONTROL 0x40000080 -#define HV_X64_MSR_SVERSION 0x40000081 -#define HV_X64_MSR_SIEFP 0x40000082 -#define HV_X64_MSR_SIMP 0x40000083 -#define HV_X64_MSR_EOM 0x40000084 -#define HV_X64_MSR_SINT0 0x40000090 -#define HV_X64_MSR_SINT1 0x40000091 -#define HV_X64_MSR_SINT2 0x40000092 -#define HV_X64_MSR_SINT3 0x40000093 -#define HV_X64_MSR_SINT4 0x40000094 -#define HV_X64_MSR_SINT5 0x40000095 -#define HV_X64_MSR_SINT6 0x40000096 -#define HV_X64_MSR_SINT7 0x40000097 -#define HV_X64_MSR_SINT8 0x40000098 -#define HV_X64_MSR_SINT9 0x40000099 -#define HV_X64_MSR_SINT10 0x4000009A -#define HV_X64_MSR_SINT11 0x4000009B -#define HV_X64_MSR_SINT12 0x4000009C -#define HV_X64_MSR_SINT13 0x4000009D -#define HV_X64_MSR_SINT14 0x4000009E -#define HV_X64_MSR_SINT15 0x4000009F -#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 -#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 -#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 -#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 -#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 -#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 -#define HV_X64_MSR_STIMER3_CONFIG 
0x400000B6 -#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 -#define HV_X64_MSR_GUEST_IDLE 0x400000F0 -#define HV_X64_MSR_CRASH_P0 0x40000100 -#define HV_X64_MSR_CRASH_P1 0x40000101 -#define HV_X64_MSR_CRASH_P2 0x40000102 -#define HV_X64_MSR_CRASH_P3 0x40000103 -#define HV_X64_MSR_CRASH_P4 0x40000104 -#define HV_X64_MSR_CRASH_CTL 0x40000105 -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 -#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 -#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 - -#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 -#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 -#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 -#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 -#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 -#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF - -/* HYPERV_CPUID_FEATURES.EAX */ -#define HV_MSR_VP_RUNTIME_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 0) -#define HV_MSR_TIME_REF_COUNT_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 1) -#define HV_MSR_SYNIC_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 2) -#define HV_MSR_SYNTIMER_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 3) -#define HV_MSR_APIC_ACCESS_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 4) -#define HV_MSR_HYPERCALL_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 5) -#define HV_MSR_VP_INDEX_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 6) -#define HV_MSR_RESET_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 7) -#define HV_MSR_STAT_PAGES_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 8) -#define HV_MSR_REFERENCE_TSC_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 9) -#define HV_MSR_GUEST_IDLE_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 10) -#define HV_ACCESS_FREQUENCY_MSRS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 11) -#define HV_ACCESS_REENLIGHTENMENT \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 13) -#define HV_ACCESS_TSC_INVARIANT \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 15) - -/* HYPERV_CPUID_FEATURES.EBX */ -#define HV_CREATE_PARTITIONS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 0) -#define HV_ACCESS_PARTITION_ID \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 1) -#define HV_ACCESS_MEMORY_POOL \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 2) -#define HV_ADJUST_MESSAGE_BUFFERS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 3) -#define HV_POST_MESSAGES \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 4) -#define HV_SIGNAL_EVENTS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 5) -#define HV_CREATE_PORT \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 6) -#define HV_CONNECT_PORT \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 7) -#define HV_ACCESS_STATS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 8) -#define HV_DEBUGGING \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 11) -#define HV_CPU_MANAGEMENT \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 12) -#define HV_ENABLE_EXTENDED_HYPERCALLS \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 20) -#define HV_ISOLATION \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 22) - -/* HYPERV_CPUID_FEATURES.EDX */ -#define HV_X64_MWAIT_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 0) -#define HV_X64_GUEST_DEBUGGING_AVAILABLE \ - 
KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 1) -#define HV_X64_PERF_MONITOR_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 2) -#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 3) -#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 4) -#define HV_X64_GUEST_IDLE_STATE_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 5) -#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 8) -#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 10) -#define HV_FEATURE_DEBUG_MSRS_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 11) -#define HV_STIMER_DIRECT_MODE_AVAILABLE \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 19) - -/* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ -#define HV_X64_AS_SWITCH_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 0) -#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 1) -#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 2) -#define HV_X64_APIC_ACCESS_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 3) -#define HV_X64_SYSTEM_RESET_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 4) -#define HV_X64_RELAXED_TIMING_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 5) -#define HV_DEPRECATING_AEOI_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 9) -#define HV_X64_CLUSTER_IPI_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 10) -#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 11) -#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) - -/* HYPERV_CPUID_NESTED_FEATURES.EAX */ -#define HV_X64_NESTED_DIRECT_FLUSH \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) -#define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) -#define HV_X64_NESTED_MSR_BITMAP \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) - -/* HYPERV_CPUID_NESTED_FEATURES.EBX */ -#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) - -/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ -#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) - -/* Hypercalls */ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_GET_PARTITION_ID 0x0046 -#define HVCALL_DEPOSIT_MEMORY 0x0048 -#define HVCALL_CREATE_VP 0x004e -#define HVCALL_GET_VP_REGISTERS 0x0050 -#define HVCALL_SET_VP_REGISTERS 0x0051 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_POST_DEBUG_DATA 0x0069 -#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a -#define HVCALL_RESET_DEBUG_SESSION 0x006b -#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 -#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c -#define HVCALL_UNMAP_DEVICE_INTERRUPT 
0x007d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 - -/* Extended hypercalls */ -#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 - -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_ACCESS_DENIED 6 -#define HV_STATUS_OPERATION_DENIED 8 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 - -/* hypercall options */ -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_REP_COMP_OFFSET 32 - -/* - * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status' - * is set to the hypercall status (if no exception occurred). - */ -static inline uint8_t __hyperv_hypercall(u64 control, vm_vaddr_t input_address, - vm_vaddr_t output_address, - uint64_t *hv_status) -{ - uint64_t error_code; - uint8_t vector; - - /* Note both the hypercall and the "asm safe" clobber r9-r11. */ - asm volatile("mov %[output_address], %%r8\n\t" - KVM_ASM_SAFE("vmcall") - : "=a" (*hv_status), - "+c" (control), "+d" (input_address), - KVM_ASM_SAFE_OUTPUTS(vector, error_code) - : [output_address] "r"(output_address), - "a" (-EFAULT) - : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); - return vector; -} - -/* Issue a Hyper-V hypercall and assert that it succeeded. */ -static inline void hyperv_hypercall(u64 control, vm_vaddr_t input_address, - vm_vaddr_t output_address) -{ - uint64_t hv_status; - uint8_t vector; - - vector = __hyperv_hypercall(control, input_address, output_address, &hv_status); - - GUEST_ASSERT(!vector); - GUEST_ASSERT((hv_status & 0xffff) == 0); -} - -/* Write 'Fast' hypercall input 'data' to the first 'n_sse_regs' SSE regs */ -static inline void hyperv_write_xmm_input(void *data, int n_sse_regs) -{ - int i; - - for (i = 0; i < n_sse_regs; i++) - write_sse_reg(i, (sse128_t *)(data + sizeof(sse128_t) * i)); -} - -/* Proper HV_X64_MSR_GUEST_OS_ID value */ -#define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) - -#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 -#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) - -struct hv_nested_enlightenments_control { - struct { - __u32 directhypercall:1; - __u32 reserved:31; - } features; - struct { - __u32 reserved; - } hypercallControls; -} __packed; - -/* Define virtual processor assist page structure. 
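
The hypercall helpers removed above hard-code the Hyper-V TLFS calling convention: the control word goes in RCX, the input and output GPAs in RDX and R8, and the status comes back in RAX. The HV_HYPERCALL_* defines give the bit positions inside the control word. A small sketch, with a made-up rep count and placeholder GPAs, of how a caller would typically compose that control word before handing it to hyperv_hypercall():

#include <stdint.h>

/* Same values as the defines above; repeated here so the sketch stands alone. */
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
#define HV_HYPERCALL_REP_COMP_OFFSET		32

static inline uint64_t hv_build_control(uint16_t call_code, uint16_t rep_count)
{
	return (uint64_t)call_code |
	       ((uint64_t)rep_count << HV_HYPERCALL_REP_COMP_OFFSET);
}

/* In guest code, with input_gpa/output_gpa pointing at prepared pages:
 *
 *	uint64_t control = hv_build_control(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, 1);
 *	hyperv_hypercall(control, input_gpa, output_gpa);
 */
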
*/ -struct hv_vp_assist_page { - __u32 apic_assist; - __u32 reserved1; - __u64 vtl_control[3]; - struct hv_nested_enlightenments_control nested_control; - __u8 enlighten_vmentry; - __u8 reserved2[7]; - __u64 current_nested_vmcs; -} __packed; - -extern struct hv_vp_assist_page *current_vp_assist; - -int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist); - -struct hyperv_test_pages { - /* VP assist page */ - void *vp_assist_hva; - uint64_t vp_assist_gpa; - void *vp_assist; - - /* Partition assist page */ - void *partition_assist_hva; - uint64_t partition_assist_gpa; - void *partition_assist; - - /* Enlightened VMCS */ - void *enlightened_vmcs_hva; - uint64_t enlightened_vmcs_gpa; - void *enlightened_vmcs; -}; - -struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, - vm_vaddr_t *p_hv_pages_gva); - -/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ -#define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) - -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); - -bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); - -#endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h deleted file mode 100644 index 972bb1c4ab4c..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UTIL_ARCH_H -#define SELFTEST_KVM_UTIL_ARCH_H - -#include -#include - -#include "kvm_util_types.h" -#include "test_util.h" - -extern bool is_forced_emulation_enabled; - -struct kvm_vm_arch { - vm_vaddr_t gdt; - vm_vaddr_t tss; - vm_vaddr_t idt; - - uint64_t c_bit; - uint64_t s_bit; - int sev_fd; - bool is_pt_protected; -}; - -static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) -{ - return arch->c_bit || arch->s_bit; -} - -#define vm_arch_has_protected_memory(vm) \ - __vm_arch_has_protected_memory(&(vm)->arch) - -#define vcpu_arch_put_guest(mem, __val) \ -do { \ - const typeof(mem) val = (__val); \ - \ - if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ - (mem) = val; \ - } else if (guest_random_bool(&guest_rng)) { \ - __asm__ __volatile__(KVM_FEP "mov %1, %0" \ - : "+m" (mem) \ - : "r" (val) : "memory"); \ - } else { \ - uint64_t __old = READ_ONCE(mem); \ - \ - __asm__ __volatile__(KVM_FEP LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ - : [ptr] "+m" (mem), [old] "+a" (__old) \ - : [new]"r" (val) : "memory", "cc"); \ - } \ -} while (0) - -#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86_64/mce.h b/tools/testing/selftests/kvm/include/x86_64/mce.h deleted file mode 100644 index 6119321f3f5d..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/mce.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/x86_64/mce.h - * - * Copyright (C) 2022, Google LLC. 
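
The vcpu_arch_put_guest() macro removed just above randomizes how each guest store is performed when forced emulation is available: a plain mov, a KVM_FEP-prefixed mov, or a KVM_FEP-prefixed locked cmpxchg, so the same test data exercises both the fast path and KVM's instruction emulator. A plain-C sketch of that selection logic, with stand-ins for the framework's RNG and the emulated-store asm:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Stand-ins, purely illustrative: the selftests use guest_random_bool() and
 * inline asm carrying the forced-emulation prefix instead. */
static bool coin_flip(void) { return rand() & 1; }
static void emulated_mov(uint64_t *mem, uint64_t val) { *mem = val; }
static void emulated_cmpxchg(uint64_t *mem, uint64_t val)
{
	__sync_val_compare_and_swap(mem, *mem, val);
}

static void put_guest_u64(uint64_t *mem, uint64_t val, bool fep_enabled)
{
	if (!fep_enabled || coin_flip())
		*mem = val;			/* ordinary store */
	else if (coin_flip())
		emulated_mov(mem, val);		/* would be the KVM_FEP "mov" */
	else
		emulated_cmpxchg(mem, val);	/* would be the KVM_FEP lock cmpxchg */
}
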
- */ - -#ifndef SELFTEST_KVM_MCE_H -#define SELFTEST_KVM_MCE_H - -#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ -#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ -#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ -#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ -#define KVM_MAX_MCE_BANKS 32 -#define MCG_CAP_BANKS_MASK 0xff /* Bit 0-7 of the MCG_CAP register are #banks */ -#define MCI_STATUS_VAL (1ULL << 63) /* valid error */ -#define MCI_STATUS_UC (1ULL << 61) /* uncorrected error */ -#define MCI_STATUS_EN (1ULL << 60) /* error enabled */ -#define MCI_STATUS_MISCV (1ULL << 59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1ULL << 58) /* addr reg. valid */ -#define MCM_ADDR_PHYS 2 /* physical address */ -#define MCI_CTL2_CMCI_EN BIT_ULL(30) - -#endif /* SELFTEST_KVM_MCE_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/pmu.h b/tools/testing/selftests/kvm/include/x86_64/pmu.h deleted file mode 100644 index 3c10c4dc0ae8..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/pmu.h +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2023, Tencent, Inc. - */ -#ifndef SELFTEST_KVM_PMU_H -#define SELFTEST_KVM_PMU_H - -#include - -#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 - -/* - * Encode an eventsel+umask pair into event-select MSR format. Note, this is - * technically AMD's format, as Intel's format only supports 8 bits for the - * event selector, i.e. doesn't use bits 24:16 for the selector. But, OR-ing - * in '0' is a nop and won't clobber the CMASK. - */ -#define RAW_EVENT(eventsel, umask) (((eventsel & 0xf00UL) << 24) | \ - ((eventsel) & 0xff) | \ - ((umask) & 0xff) << 8) - -/* - * These are technically Intel's definitions, but except for CMASK (see above), - * AMD's layout is compatible with Intel's. - */ -#define ARCH_PERFMON_EVENTSEL_EVENT GENMASK_ULL(7, 0) -#define ARCH_PERFMON_EVENTSEL_UMASK GENMASK_ULL(15, 8) -#define ARCH_PERFMON_EVENTSEL_USR BIT_ULL(16) -#define ARCH_PERFMON_EVENTSEL_OS BIT_ULL(17) -#define ARCH_PERFMON_EVENTSEL_EDGE BIT_ULL(18) -#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL BIT_ULL(19) -#define ARCH_PERFMON_EVENTSEL_INT BIT_ULL(20) -#define ARCH_PERFMON_EVENTSEL_ANY BIT_ULL(21) -#define ARCH_PERFMON_EVENTSEL_ENABLE BIT_ULL(22) -#define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23) -#define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24) - -/* RDPMC control flags, Intel only. */ -#define INTEL_RDPMC_METRICS BIT_ULL(29) -#define INTEL_RDPMC_FIXED BIT_ULL(30) -#define INTEL_RDPMC_FAST BIT_ULL(31) - -/* Fixed PMC controls, Intel only. 
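
The RAW_EVENT() macro above packs an event selector and unit mask into the event-select MSR layout: selector bits 7:0 stay in bits 7:0, the unit mask lands in bits 15:8, and selector bits 11:8 (AMD's extended selector field) are shifted up to bits 35:32. A tiny self-contained check, with the expected constants worked out by hand from the macro as shown:

#include <assert.h>

#define RAW_EVENT(eventsel, umask)	((((eventsel) & 0xf00UL) << 24) |	\
					 ((eventsel) & 0xff) |			\
					 (((umask) & 0xff) << 8))

int main(void)
{
	assert(RAW_EVENT(0x3c, 0x00) == 0x3cUL);	  /* CPU cycles */
	assert(RAW_EVENT(0x2e, 0x4f) == 0x4f2eUL);	  /* LLC references */
	assert(RAW_EVENT(0x1c0, 0x00) == 0x1000000c0UL);  /* bits 11:8 -> 35:32 */
	return 0;
}
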
*/ -#define FIXED_PMC_GLOBAL_CTRL_ENABLE(_idx) BIT_ULL((32 + (_idx))) - -#define FIXED_PMC_KERNEL BIT_ULL(0) -#define FIXED_PMC_USER BIT_ULL(1) -#define FIXED_PMC_ANYTHREAD BIT_ULL(2) -#define FIXED_PMC_ENABLE_PMI BIT_ULL(3) -#define FIXED_PMC_NR_BITS 4 -#define FIXED_PMC_CTRL(_idx, _val) ((_val) << ((_idx) * FIXED_PMC_NR_BITS)) - -#define PMU_CAP_FW_WRITES BIT_ULL(13) -#define PMU_CAP_LBR_FMT 0x3f - -#define INTEL_ARCH_CPU_CYCLES RAW_EVENT(0x3c, 0x00) -#define INTEL_ARCH_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) -#define INTEL_ARCH_REFERENCE_CYCLES RAW_EVENT(0x3c, 0x01) -#define INTEL_ARCH_LLC_REFERENCES RAW_EVENT(0x2e, 0x4f) -#define INTEL_ARCH_LLC_MISSES RAW_EVENT(0x2e, 0x41) -#define INTEL_ARCH_BRANCHES_RETIRED RAW_EVENT(0xc4, 0x00) -#define INTEL_ARCH_BRANCHES_MISPREDICTED RAW_EVENT(0xc5, 0x00) -#define INTEL_ARCH_TOPDOWN_SLOTS RAW_EVENT(0xa4, 0x01) - -#define AMD_ZEN_CORE_CYCLES RAW_EVENT(0x76, 0x00) -#define AMD_ZEN_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) -#define AMD_ZEN_BRANCHES_RETIRED RAW_EVENT(0xc2, 0x00) -#define AMD_ZEN_BRANCHES_MISPREDICTED RAW_EVENT(0xc3, 0x00) - -/* - * Note! The order and thus the index of the architectural events matters as - * support for each event is enumerated via CPUID using the index of the event. - */ -enum intel_pmu_architectural_events { - INTEL_ARCH_CPU_CYCLES_INDEX, - INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX, - INTEL_ARCH_REFERENCE_CYCLES_INDEX, - INTEL_ARCH_LLC_REFERENCES_INDEX, - INTEL_ARCH_LLC_MISSES_INDEX, - INTEL_ARCH_BRANCHES_RETIRED_INDEX, - INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX, - INTEL_ARCH_TOPDOWN_SLOTS_INDEX, - NR_INTEL_ARCH_EVENTS, -}; - -enum amd_pmu_zen_events { - AMD_ZEN_CORE_CYCLES_INDEX, - AMD_ZEN_INSTRUCTIONS_INDEX, - AMD_ZEN_BRANCHES_INDEX, - AMD_ZEN_BRANCH_MISSES_INDEX, - NR_AMD_ZEN_EVENTS, -}; - -extern const uint64_t intel_pmu_arch_events[]; -extern const uint64_t amd_pmu_zen_events[]; - -#endif /* SELFTEST_KVM_PMU_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h deleted file mode 100644 index 645200e95f89..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ /dev/null @@ -1,1397 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/x86_64/processor.h - * - * Copyright (C) 2018, Google LLC. - */ - -#ifndef SELFTEST_KVM_PROCESSOR_H -#define SELFTEST_KVM_PROCESSOR_H - -#include -#include -#include - -#include -#include - -#include -#include - -#include "kvm_util.h" -#include "ucall_common.h" - -extern bool host_cpu_is_intel; -extern bool host_cpu_is_amd; -extern uint64_t guest_tsc_khz; - -#ifndef MAX_NR_CPUID_ENTRIES -#define MAX_NR_CPUID_ENTRIES 100 -#endif - -/* Forced emulation prefix, used to invoke the emulator unconditionally. 
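
The FIXED_PMC_* helpers above encode Intel's fixed-counter controls: each counter gets a 4-bit field in IA32_FIXED_CTR_CTRL, and a matching enable bit at position 32+idx in IA32_PERF_GLOBAL_CTRL. A hedged sketch of enabling fixed counter 0 (instructions retired) for ring 0 and ring 3 from guest code; the MSR indices and the wrmsr() helper are the usual SDM/selftest ones, included here only to keep the example self-contained:

#include <stdint.h>

#define MSR_CORE_PERF_FIXED_CTR_CTRL	0x38d
#define MSR_CORE_PERF_GLOBAL_CTRL	0x38f

static inline void wrmsr(uint32_t msr, uint64_t val)
{
	asm volatile("wrmsr" :: "c"(msr), "a"((uint32_t)val),
				"d"((uint32_t)(val >> 32)) : "memory");
}

static void enable_fixed_ctr0(void)
{
	/* FIXED_PMC_CTRL(0, FIXED_PMC_KERNEL | FIXED_PMC_USER) == 0x3 */
	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, 0x3);
	/* FIXED_PMC_GLOBAL_CTRL_ENABLE(0) == BIT_ULL(32) */
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1ULL << 32);
}
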
*/ -#define KVM_FEP "ud2; .byte 'k', 'v', 'm';" - -#define NMI_VECTOR 0x02 - -#define X86_EFLAGS_FIXED (1u << 1) - -#define X86_CR4_VME (1ul << 0) -#define X86_CR4_PVI (1ul << 1) -#define X86_CR4_TSD (1ul << 2) -#define X86_CR4_DE (1ul << 3) -#define X86_CR4_PSE (1ul << 4) -#define X86_CR4_PAE (1ul << 5) -#define X86_CR4_MCE (1ul << 6) -#define X86_CR4_PGE (1ul << 7) -#define X86_CR4_PCE (1ul << 8) -#define X86_CR4_OSFXSR (1ul << 9) -#define X86_CR4_OSXMMEXCPT (1ul << 10) -#define X86_CR4_UMIP (1ul << 11) -#define X86_CR4_LA57 (1ul << 12) -#define X86_CR4_VMXE (1ul << 13) -#define X86_CR4_SMXE (1ul << 14) -#define X86_CR4_FSGSBASE (1ul << 16) -#define X86_CR4_PCIDE (1ul << 17) -#define X86_CR4_OSXSAVE (1ul << 18) -#define X86_CR4_SMEP (1ul << 20) -#define X86_CR4_SMAP (1ul << 21) -#define X86_CR4_PKE (1ul << 22) - -struct xstate_header { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; -} __attribute__((packed)); - -struct xstate { - u8 i387[512]; - struct xstate_header header; - u8 extended_state_area[0]; -} __attribute__ ((packed, aligned (64))); - -#define XFEATURE_MASK_FP BIT_ULL(0) -#define XFEATURE_MASK_SSE BIT_ULL(1) -#define XFEATURE_MASK_YMM BIT_ULL(2) -#define XFEATURE_MASK_BNDREGS BIT_ULL(3) -#define XFEATURE_MASK_BNDCSR BIT_ULL(4) -#define XFEATURE_MASK_OPMASK BIT_ULL(5) -#define XFEATURE_MASK_ZMM_Hi256 BIT_ULL(6) -#define XFEATURE_MASK_Hi16_ZMM BIT_ULL(7) -#define XFEATURE_MASK_PT BIT_ULL(8) -#define XFEATURE_MASK_PKRU BIT_ULL(9) -#define XFEATURE_MASK_PASID BIT_ULL(10) -#define XFEATURE_MASK_CET_USER BIT_ULL(11) -#define XFEATURE_MASK_CET_KERNEL BIT_ULL(12) -#define XFEATURE_MASK_LBR BIT_ULL(15) -#define XFEATURE_MASK_XTILE_CFG BIT_ULL(17) -#define XFEATURE_MASK_XTILE_DATA BIT_ULL(18) - -#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) -#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA | \ - XFEATURE_MASK_XTILE_CFG) - -/* Note, these are ordered alphabetically to match kvm_cpuid_entry2. Eww. */ -enum cpuid_output_regs { - KVM_CPUID_EAX, - KVM_CPUID_EBX, - KVM_CPUID_ECX, - KVM_CPUID_EDX -}; - -/* - * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be - * passed by value with no overhead. - */ -struct kvm_x86_cpu_feature { - u32 function; - u16 index; - u8 reg; - u8 bit; -}; -#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ -({ \ - struct kvm_x86_cpu_feature feature = { \ - .function = fn, \ - .index = idx, \ - .reg = KVM_CPUID_##gpr, \ - .bit = __bit, \ - }; \ - \ - kvm_static_assert((fn & 0xc0000000) == 0 || \ - (fn & 0xc0000000) == 0x40000000 || \ - (fn & 0xc0000000) == 0x80000000 || \ - (fn & 0xc0000000) == 0xc0000000); \ - kvm_static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ - feature; \ -}) - -/* - * Basic Leafs, a.k.a. 
Intel defined - */ -#define X86_FEATURE_MWAIT KVM_X86_CPU_FEATURE(0x1, 0, ECX, 3) -#define X86_FEATURE_VMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5) -#define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) -#define X86_FEATURE_PDCM KVM_X86_CPU_FEATURE(0x1, 0, ECX, 15) -#define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) -#define X86_FEATURE_X2APIC KVM_X86_CPU_FEATURE(0x1, 0, ECX, 21) -#define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) -#define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) -#define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) -#define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) -#define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) -#define X86_FEATURE_HYPERVISOR KVM_X86_CPU_FEATURE(0x1, 0, ECX, 31) -#define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6) -#define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) -#define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) -#define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19) -#define X86_FEATURE_XMM KVM_X86_CPU_FEATURE(0x1, 0, EDX, 25) -#define X86_FEATURE_XMM2 KVM_X86_CPU_FEATURE(0x1, 0, EDX, 26) -#define X86_FEATURE_FSGSBASE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 0) -#define X86_FEATURE_TSC_ADJUST KVM_X86_CPU_FEATURE(0x7, 0, EBX, 1) -#define X86_FEATURE_SGX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 2) -#define X86_FEATURE_HLE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 4) -#define X86_FEATURE_SMEP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 7) -#define X86_FEATURE_INVPCID KVM_X86_CPU_FEATURE(0x7, 0, EBX, 10) -#define X86_FEATURE_RTM KVM_X86_CPU_FEATURE(0x7, 0, EBX, 11) -#define X86_FEATURE_MPX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 14) -#define X86_FEATURE_SMAP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 20) -#define X86_FEATURE_PCOMMIT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 22) -#define X86_FEATURE_CLFLUSHOPT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 23) -#define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24) -#define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2) -#define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) -#define X86_FEATURE_OSPKE KVM_X86_CPU_FEATURE(0x7, 0, ECX, 4) -#define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) -#define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) -#define X86_FEATURE_SGX_LC KVM_X86_CPU_FEATURE(0x7, 0, ECX, 30) -#define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) -#define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) -#define X86_FEATURE_AMX_TILE KVM_X86_CPU_FEATURE(0x7, 0, EDX, 24) -#define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) -#define X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) -#define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) -#define X86_FEATURE_XTILECFG KVM_X86_CPU_FEATURE(0xD, 0, EAX, 17) -#define X86_FEATURE_XTILEDATA KVM_X86_CPU_FEATURE(0xD, 0, EAX, 18) -#define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) -#define X86_FEATURE_XFD KVM_X86_CPU_FEATURE(0xD, 1, EAX, 4) -#define X86_FEATURE_XTILEDATA_XFD KVM_X86_CPU_FEATURE(0xD, 18, ECX, 2) - -/* - * Extended Leafs, a.k.a. 
AMD defined - */ -#define X86_FEATURE_SVM KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 2) -#define X86_FEATURE_NX KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 20) -#define X86_FEATURE_GBPAGES KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26) -#define X86_FEATURE_RDTSCP KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27) -#define X86_FEATURE_LM KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 29) -#define X86_FEATURE_INVTSC KVM_X86_CPU_FEATURE(0x80000007, 0, EDX, 8) -#define X86_FEATURE_RDPRU KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 4) -#define X86_FEATURE_AMD_IBPB KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 12) -#define X86_FEATURE_NPT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 0) -#define X86_FEATURE_LBRV KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 1) -#define X86_FEATURE_NRIPS KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 3) -#define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) -#define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) -#define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) -#define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) -#define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) -#define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) - -/* - * KVM defined paravirt features. - */ -#define X86_FEATURE_KVM_CLOCKSOURCE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 0) -#define X86_FEATURE_KVM_NOP_IO_DELAY KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 1) -#define X86_FEATURE_KVM_MMU_OP KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 2) -#define X86_FEATURE_KVM_CLOCKSOURCE2 KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 3) -#define X86_FEATURE_KVM_ASYNC_PF KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 4) -#define X86_FEATURE_KVM_STEAL_TIME KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 5) -#define X86_FEATURE_KVM_PV_EOI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 6) -#define X86_FEATURE_KVM_PV_UNHALT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 7) -/* Bit 8 apparently isn't used?!?! */ -#define X86_FEATURE_KVM_PV_TLB_FLUSH KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 9) -#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 10) -#define X86_FEATURE_KVM_PV_SEND_IPI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 11) -#define X86_FEATURE_KVM_POLL_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 12) -#define X86_FEATURE_KVM_PV_SCHED_YIELD KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 13) -#define X86_FEATURE_KVM_ASYNC_PF_INT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 14) -#define X86_FEATURE_KVM_MSI_EXT_DEST_ID KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 15) -#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) -#define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) - -/* - * Same idea as X86_FEATURE_XXX, but X86_PROPERTY_XXX retrieves a multi-bit - * value/property as opposed to a single-bit feature. Again, pack the info - * into a 64-bit value to pass by value with no overhead. 
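
As with the feature macros, each X86_PROPERTY_* below names a CPUID leaf, an index, a register, and an inclusive lo_bit..hi_bit range; consumers read the leaf and mask out that bit range. A small illustrative extractor (not the selftests' exact helper) showing how such a packed descriptor would be consumed:

#include <stdint.h>

struct cpu_property {
	uint32_t function;
	uint8_t index, reg, lo_bit, hi_bit;
};

static inline uint32_t property_extract(uint32_t cpuid_reg, const struct cpu_property *p)
{
	uint32_t width = p->hi_bit - p->lo_bit + 1;
	uint32_t mask = width == 32 ? ~0u : (1u << width) - 1;

	return (cpuid_reg >> p->lo_bit) & mask;
}

/* e.g. X86_PROPERTY_MAX_PHY_ADDR is CPUID.0x80000008.EAX[7:0], so feeding
 * that leaf's EAX value here yields the physical address width. */
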
- */ -struct kvm_x86_cpu_property { - u32 function; - u8 index; - u8 reg; - u8 lo_bit; - u8 hi_bit; -}; -#define KVM_X86_CPU_PROPERTY(fn, idx, gpr, low_bit, high_bit) \ -({ \ - struct kvm_x86_cpu_property property = { \ - .function = fn, \ - .index = idx, \ - .reg = KVM_CPUID_##gpr, \ - .lo_bit = low_bit, \ - .hi_bit = high_bit, \ - }; \ - \ - kvm_static_assert(low_bit < high_bit); \ - kvm_static_assert((fn & 0xc0000000) == 0 || \ - (fn & 0xc0000000) == 0x40000000 || \ - (fn & 0xc0000000) == 0x80000000 || \ - (fn & 0xc0000000) == 0xc0000000); \ - kvm_static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ - property; \ -}) - -#define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) -#define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7) -#define X86_PROPERTY_PMU_NR_GP_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 8, 15) -#define X86_PROPERTY_PMU_GP_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 16, 23) -#define X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 24, 31) -#define X86_PROPERTY_PMU_EVENTS_MASK KVM_X86_CPU_PROPERTY(0xa, 0, EBX, 0, 7) -#define X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK KVM_X86_CPU_PROPERTY(0xa, 0, ECX, 0, 31) -#define X86_PROPERTY_PMU_NR_FIXED_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 0, 4) -#define X86_PROPERTY_PMU_FIXED_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 5, 12) - -#define X86_PROPERTY_SUPPORTED_XCR0_LO KVM_X86_CPU_PROPERTY(0xd, 0, EAX, 0, 31) -#define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) -#define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) -#define X86_PROPERTY_SUPPORTED_XCR0_HI KVM_X86_CPU_PROPERTY(0xd, 0, EDX, 0, 31) - -#define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31) -#define X86_PROPERTY_XSTATE_TILE_OFFSET KVM_X86_CPU_PROPERTY(0xd, 18, EBX, 0, 31) -#define X86_PROPERTY_AMX_MAX_PALETTE_TABLES KVM_X86_CPU_PROPERTY(0x1d, 0, EAX, 0, 31) -#define X86_PROPERTY_AMX_TOTAL_TILE_BYTES KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 0, 15) -#define X86_PROPERTY_AMX_BYTES_PER_TILE KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 16, 31) -#define X86_PROPERTY_AMX_BYTES_PER_ROW KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 0, 15) -#define X86_PROPERTY_AMX_NR_TILE_REGS KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 16, 31) -#define X86_PROPERTY_AMX_MAX_ROWS KVM_X86_CPU_PROPERTY(0x1d, 1, ECX, 0, 15) - -#define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31) - -#define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) -#define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) -#define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) -#define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) -#define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) -#define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) - -#define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) - -/* - * Intel's architectural PMU events are bizarre. They have a "feature" bit - * that indicates the feature is _not_ supported, and a property that states - * the length of the bit mask of unsupported features. A feature is supported - * if the size of the bit mask is larger than the "unavailable" bit, and said - * bit is not set. Fixed counters also bizarre enumeration, but inverted from - * arch events for general purpose counters. 
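
For the general purpose counters, the rule described above boils down to a two-part test: the enumerated bit-vector must be long enough to cover the event, and the event's "unavailable" bit must be clear. A sketch of that check with illustrative parameters (the selftests obtain both values from CPUID.0xA via the property and feature helpers in this header):

#include <stdbool.h>
#include <stdint.h>

static inline bool arch_event_supported(uint32_t unavailable_mask,
					uint32_t bit_vector_length,
					uint32_t event_bit)
{
	return bit_vector_length > event_bit &&
	       !(unavailable_mask & (1u << event_bit));
}
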
Fixed counters are supported if a - * feature flag is set **OR** the total number of fixed counters is greater - * than index of the counter. - * - * Wrap the events for general purpose and fixed counters to simplify checking - * whether or not a given architectural event is supported. - */ -struct kvm_x86_pmu_feature { - struct kvm_x86_cpu_feature f; -}; -#define KVM_X86_PMU_FEATURE(__reg, __bit) \ -({ \ - struct kvm_x86_pmu_feature feature = { \ - .f = KVM_X86_CPU_FEATURE(0xa, 0, __reg, __bit), \ - }; \ - \ - kvm_static_assert(KVM_CPUID_##__reg == KVM_CPUID_EBX || \ - KVM_CPUID_##__reg == KVM_CPUID_ECX); \ - feature; \ -}) - -#define X86_PMU_FEATURE_CPU_CYCLES KVM_X86_PMU_FEATURE(EBX, 0) -#define X86_PMU_FEATURE_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 1) -#define X86_PMU_FEATURE_REFERENCE_CYCLES KVM_X86_PMU_FEATURE(EBX, 2) -#define X86_PMU_FEATURE_LLC_REFERENCES KVM_X86_PMU_FEATURE(EBX, 3) -#define X86_PMU_FEATURE_LLC_MISSES KVM_X86_PMU_FEATURE(EBX, 4) -#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 5) -#define X86_PMU_FEATURE_BRANCHES_MISPREDICTED KVM_X86_PMU_FEATURE(EBX, 6) -#define X86_PMU_FEATURE_TOPDOWN_SLOTS KVM_X86_PMU_FEATURE(EBX, 7) - -#define X86_PMU_FEATURE_INSNS_RETIRED_FIXED KVM_X86_PMU_FEATURE(ECX, 0) -#define X86_PMU_FEATURE_CPU_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 1) -#define X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 2) -#define X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED KVM_X86_PMU_FEATURE(ECX, 3) - -static inline unsigned int x86_family(unsigned int eax) -{ - unsigned int x86; - - x86 = (eax >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; - - return x86; -} - -static inline unsigned int x86_model(unsigned int eax) -{ - return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); -} - -/* Page table bitfield declarations */ -#define PTE_PRESENT_MASK BIT_ULL(0) -#define PTE_WRITABLE_MASK BIT_ULL(1) -#define PTE_USER_MASK BIT_ULL(2) -#define PTE_ACCESSED_MASK BIT_ULL(5) -#define PTE_DIRTY_MASK BIT_ULL(6) -#define PTE_LARGE_MASK BIT_ULL(7) -#define PTE_GLOBAL_MASK BIT_ULL(8) -#define PTE_NX_MASK BIT_ULL(63) - -#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) - -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1ULL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1) & PHYSICAL_PAGE_MASK) - -#define HUGEPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) -#define HUGEPAGE_SIZE(x) (1UL << HUGEPAGE_SHIFT(x)) -#define HUGEPAGE_MASK(x) (~(HUGEPAGE_SIZE(x) - 1) & PHYSICAL_PAGE_MASK) - -#define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK) -#define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT) - -/* General Registers in 64-Bit Mode */ -struct gpr64_regs { - u64 rax; - u64 rcx; - u64 rdx; - u64 rbx; - u64 rsp; - u64 rbp; - u64 rsi; - u64 rdi; - u64 r8; - u64 r9; - u64 r10; - u64 r11; - u64 r12; - u64 r13; - u64 r14; - u64 r15; -}; - -struct desc64 { - uint16_t limit0; - uint16_t base0; - unsigned base1:8, type:4, s:1, dpl:2, p:1; - unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; - uint32_t base3; - uint32_t zero1; -} __attribute__((packed)); - -struct desc_ptr { - uint16_t size; - uint64_t address; -} __attribute__((packed)); - -struct kvm_x86_state { - struct kvm_xsave *xsave; - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - struct kvm_regs regs; - struct kvm_xcrs xcrs; - struct kvm_sregs sregs; - struct kvm_debugregs debugregs; - union { - struct kvm_nested_state nested; - char nested_[16384]; - }; - struct kvm_msrs msrs; -}; - -static inline uint64_t get_desc64_base(const struct desc64 *desc) -{ - return ((uint64_t)desc->base3 << 
32) | - (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); -} - -static inline uint64_t rdtsc(void) -{ - uint32_t eax, edx; - uint64_t tsc_val; - /* - * The lfence is to wait (on Intel CPUs) until all previous - * instructions have been executed. If software requires RDTSC to be - * executed prior to execution of any subsequent instruction, it can - * execute LFENCE immediately after RDTSC - */ - __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx)); - tsc_val = ((uint64_t)edx) << 32 | eax; - return tsc_val; -} - -static inline uint64_t rdtscp(uint32_t *aux) -{ - uint32_t eax, edx; - - __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux)); - return ((uint64_t)edx) << 32 | eax; -} - -static inline uint64_t rdmsr(uint32_t msr) -{ - uint32_t a, d; - - __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory"); - - return a | ((uint64_t) d << 32); -} - -static inline void wrmsr(uint32_t msr, uint64_t value) -{ - uint32_t a = value; - uint32_t d = value >> 32; - - __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory"); -} - - -static inline uint16_t inw(uint16_t port) -{ - uint16_t tmp; - - __asm__ __volatile__("in %%dx, %%ax" - : /* output */ "=a" (tmp) - : /* input */ "d" (port)); - - return tmp; -} - -static inline uint16_t get_es(void) -{ - uint16_t es; - - __asm__ __volatile__("mov %%es, %[es]" - : /* output */ [es]"=rm"(es)); - return es; -} - -static inline uint16_t get_cs(void) -{ - uint16_t cs; - - __asm__ __volatile__("mov %%cs, %[cs]" - : /* output */ [cs]"=rm"(cs)); - return cs; -} - -static inline uint16_t get_ss(void) -{ - uint16_t ss; - - __asm__ __volatile__("mov %%ss, %[ss]" - : /* output */ [ss]"=rm"(ss)); - return ss; -} - -static inline uint16_t get_ds(void) -{ - uint16_t ds; - - __asm__ __volatile__("mov %%ds, %[ds]" - : /* output */ [ds]"=rm"(ds)); - return ds; -} - -static inline uint16_t get_fs(void) -{ - uint16_t fs; - - __asm__ __volatile__("mov %%fs, %[fs]" - : /* output */ [fs]"=rm"(fs)); - return fs; -} - -static inline uint16_t get_gs(void) -{ - uint16_t gs; - - __asm__ __volatile__("mov %%gs, %[gs]" - : /* output */ [gs]"=rm"(gs)); - return gs; -} - -static inline uint16_t get_tr(void) -{ - uint16_t tr; - - __asm__ __volatile__("str %[tr]" - : /* output */ [tr]"=rm"(tr)); - return tr; -} - -static inline uint64_t get_cr0(void) -{ - uint64_t cr0; - - __asm__ __volatile__("mov %%cr0, %[cr0]" - : /* output */ [cr0]"=r"(cr0)); - return cr0; -} - -static inline uint64_t get_cr3(void) -{ - uint64_t cr3; - - __asm__ __volatile__("mov %%cr3, %[cr3]" - : /* output */ [cr3]"=r"(cr3)); - return cr3; -} - -static inline uint64_t get_cr4(void) -{ - uint64_t cr4; - - __asm__ __volatile__("mov %%cr4, %[cr4]" - : /* output */ [cr4]"=r"(cr4)); - return cr4; -} - -static inline void set_cr4(uint64_t val) -{ - __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); -} - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - __asm__ __volatile__("xgetbv;" - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax | ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - __asm__ __volatile__("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); -} - -static inline void wrpkru(u32 pkru) -{ - /* Note, ECX and EDX are architecturally required to be '0'. 
*/ - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (pkru), "c"(0), "d"(0)); -} - -static inline struct desc_ptr get_gdt(void) -{ - struct desc_ptr gdt; - __asm__ __volatile__("sgdt %[gdt]" - : /* output */ [gdt]"=m"(gdt)); - return gdt; -} - -static inline struct desc_ptr get_idt(void) -{ - struct desc_ptr idt; - __asm__ __volatile__("sidt %[idt]" - : /* output */ [idt]"=m"(idt)); - return idt; -} - -static inline void outl(uint16_t port, uint32_t value) -{ - __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); -} - -static inline void __cpuid(uint32_t function, uint32_t index, - uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) -{ - *eax = function; - *ecx = index; - - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); -} - -static inline void cpuid(uint32_t function, - uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) -{ - return __cpuid(function, 0, eax, ebx, ecx, edx); -} - -static inline uint32_t this_cpu_fms(void) -{ - uint32_t eax, ebx, ecx, edx; - - cpuid(1, &eax, &ebx, &ecx, &edx); - return eax; -} - -static inline uint32_t this_cpu_family(void) -{ - return x86_family(this_cpu_fms()); -} - -static inline uint32_t this_cpu_model(void) -{ - return x86_model(this_cpu_fms()); -} - -static inline bool this_cpu_vendor_string_is(const char *vendor) -{ - const uint32_t *chunk = (const uint32_t *)vendor; - uint32_t eax, ebx, ecx, edx; - - cpuid(0, &eax, &ebx, &ecx, &edx); - return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); -} - -static inline bool this_cpu_is_intel(void) -{ - return this_cpu_vendor_string_is("GenuineIntel"); -} - -/* - * Exclude early K5 samples with a vendor string of "AMDisbetter!" - */ -static inline bool this_cpu_is_amd(void) -{ - return this_cpu_vendor_string_is("AuthenticAMD"); -} - -static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index, - uint8_t reg, uint8_t lo, uint8_t hi) -{ - uint32_t gprs[4]; - - __cpuid(function, index, - &gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX], - &gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]); - - return (gprs[reg] & GENMASK(hi, lo)) >> lo; -} - -static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) -{ - return __this_cpu_has(feature.function, feature.index, - feature.reg, feature.bit, feature.bit); -} - -static inline uint32_t this_cpu_property(struct kvm_x86_cpu_property property) -{ - return __this_cpu_has(property.function, property.index, - property.reg, property.lo_bit, property.hi_bit); -} - -static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property) -{ - uint32_t max_leaf; - - switch (property.function & 0xc0000000) { - case 0: - max_leaf = this_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); - break; - case 0x40000000: - max_leaf = this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); - break; - case 0x80000000: - max_leaf = this_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); - break; - case 0xc0000000: - max_leaf = this_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); - } - return max_leaf >= property.function; -} - -static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) -{ - uint32_t nr_bits; - - if (feature.f.reg == KVM_CPUID_EBX) { - nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); - return nr_bits > feature.f.bit && !this_cpu_has(feature.f); - } - - GUEST_ASSERT(feature.f.reg == KVM_CPUID_ECX); - nr_bits = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - return nr_bits > feature.f.bit || this_cpu_has(feature.f); -} - -static 
__always_inline uint64_t this_cpu_supported_xcr0(void) -{ - if (!this_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) - return 0; - - return this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | - ((uint64_t)this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); -} - -typedef u32 __attribute__((vector_size(16))) sse128_t; -#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } -#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) -#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) - -static inline void read_sse_reg(int reg, sse128_t *data) -{ - switch (reg) { - case 0: - asm("movdqa %%xmm0, %0" : "=m"(*data)); - break; - case 1: - asm("movdqa %%xmm1, %0" : "=m"(*data)); - break; - case 2: - asm("movdqa %%xmm2, %0" : "=m"(*data)); - break; - case 3: - asm("movdqa %%xmm3, %0" : "=m"(*data)); - break; - case 4: - asm("movdqa %%xmm4, %0" : "=m"(*data)); - break; - case 5: - asm("movdqa %%xmm5, %0" : "=m"(*data)); - break; - case 6: - asm("movdqa %%xmm6, %0" : "=m"(*data)); - break; - case 7: - asm("movdqa %%xmm7, %0" : "=m"(*data)); - break; - default: - BUG(); - } -} - -static inline void write_sse_reg(int reg, const sse128_t *data) -{ - switch (reg) { - case 0: - asm("movdqa %0, %%xmm0" : : "m"(*data)); - break; - case 1: - asm("movdqa %0, %%xmm1" : : "m"(*data)); - break; - case 2: - asm("movdqa %0, %%xmm2" : : "m"(*data)); - break; - case 3: - asm("movdqa %0, %%xmm3" : : "m"(*data)); - break; - case 4: - asm("movdqa %0, %%xmm4" : : "m"(*data)); - break; - case 5: - asm("movdqa %0, %%xmm5" : : "m"(*data)); - break; - case 6: - asm("movdqa %0, %%xmm6" : : "m"(*data)); - break; - case 7: - asm("movdqa %0, %%xmm7" : : "m"(*data)); - break; - default: - BUG(); - } -} - -static inline void cpu_relax(void) -{ - asm volatile("rep; nop" ::: "memory"); -} - -static inline void udelay(unsigned long usec) -{ - uint64_t start, now, cycles; - - GUEST_ASSERT(guest_tsc_khz); - cycles = guest_tsc_khz / 1000 * usec; - - /* - * Deliberately don't PAUSE, a.k.a. cpu_relax(), so that the delay is - * as accurate as possible, e.g. doesn't trigger PAUSE-Loop VM-Exits. - */ - start = rdtsc(); - do { - now = rdtsc(); - } while (now - start < cycles); -} - -#define ud2() \ - __asm__ __volatile__( \ - "ud2\n" \ - ) - -#define hlt() \ - __asm__ __volatile__( \ - "hlt\n" \ - ) - -struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu); -void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state); -void kvm_x86_state_cleanup(struct kvm_x86_state *state); - -const struct kvm_msr_list *kvm_get_msr_index_list(void); -const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); -bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); -uint64_t kvm_get_feature_msr(uint64_t msr_index); - -static inline void vcpu_msrs_get(struct kvm_vcpu *vcpu, - struct kvm_msrs *msrs) -{ - int r = __vcpu_ioctl(vcpu, KVM_GET_MSRS, msrs); - - TEST_ASSERT(r == msrs->nmsrs, - "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", - r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); -} -static inline void vcpu_msrs_set(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs) -{ - int r = __vcpu_ioctl(vcpu, KVM_SET_MSRS, msrs); - - TEST_ASSERT(r == msrs->nmsrs, - "KVM_SET_MSRS failed, r: %i (failed on MSR %x)", - r, r < 0 || r >= msrs->nmsrs ? 
-1 : msrs->entries[r].index); -} -static inline void vcpu_debugregs_get(struct kvm_vcpu *vcpu, - struct kvm_debugregs *debugregs) -{ - vcpu_ioctl(vcpu, KVM_GET_DEBUGREGS, debugregs); -} -static inline void vcpu_debugregs_set(struct kvm_vcpu *vcpu, - struct kvm_debugregs *debugregs) -{ - vcpu_ioctl(vcpu, KVM_SET_DEBUGREGS, debugregs); -} -static inline void vcpu_xsave_get(struct kvm_vcpu *vcpu, - struct kvm_xsave *xsave) -{ - vcpu_ioctl(vcpu, KVM_GET_XSAVE, xsave); -} -static inline void vcpu_xsave2_get(struct kvm_vcpu *vcpu, - struct kvm_xsave *xsave) -{ - vcpu_ioctl(vcpu, KVM_GET_XSAVE2, xsave); -} -static inline void vcpu_xsave_set(struct kvm_vcpu *vcpu, - struct kvm_xsave *xsave) -{ - vcpu_ioctl(vcpu, KVM_SET_XSAVE, xsave); -} -static inline void vcpu_xcrs_get(struct kvm_vcpu *vcpu, - struct kvm_xcrs *xcrs) -{ - vcpu_ioctl(vcpu, KVM_GET_XCRS, xcrs); -} -static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) -{ - vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); -} - -const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index); -const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); - -static inline uint32_t kvm_cpu_fms(void) -{ - return get_cpuid_entry(kvm_get_supported_cpuid(), 0x1, 0)->eax; -} - -static inline uint32_t kvm_cpu_family(void) -{ - return x86_family(kvm_cpu_fms()); -} - -static inline uint32_t kvm_cpu_model(void) -{ - return x86_model(kvm_cpu_fms()); -} - -bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, - struct kvm_x86_cpu_feature feature); - -static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) -{ - return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); -} - -uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, - struct kvm_x86_cpu_property property); - -static inline uint32_t kvm_cpu_property(struct kvm_x86_cpu_property property) -{ - return kvm_cpuid_property(kvm_get_supported_cpuid(), property); -} - -static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property) -{ - uint32_t max_leaf; - - switch (property.function & 0xc0000000) { - case 0: - max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); - break; - case 0x40000000: - max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); - break; - case 0x80000000: - max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); - break; - case 0xc0000000: - max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); - } - return max_leaf >= property.function; -} - -static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) -{ - uint32_t nr_bits; - - if (feature.f.reg == KVM_CPUID_EBX) { - nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); - return nr_bits > feature.f.bit && !kvm_cpu_has(feature.f); - } - - TEST_ASSERT_EQ(feature.f.reg, KVM_CPUID_ECX); - nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - return nr_bits > feature.f.bit || kvm_cpu_has(feature.f); -} - -static __always_inline uint64_t kvm_cpu_supported_xcr0(void) -{ - if (!kvm_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) - return 0; - - return kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | - ((uint64_t)kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); -} - -static inline size_t kvm_cpuid2_size(int nr_entries) -{ - return sizeof(struct kvm_cpuid2) + - sizeof(struct kvm_cpuid_entry2) * nr_entries; -} - -/* - * Allocate a "struct kvm_cpuid2* instance, with the 0-length arrary of - * entries sized to hold @nr_entries. The caller is responsible for freeing - * the struct. 
- */ -static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) -{ - struct kvm_cpuid2 *cpuid; - - cpuid = malloc(kvm_cpuid2_size(nr_entries)); - TEST_ASSERT(cpuid, "-ENOMEM when allocating kvm_cpuid2"); - - cpuid->nent = nr_entries; - - return cpuid; -} - -void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); - -static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, - uint32_t function, - uint32_t index) -{ - return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, - function, index); -} - -static inline struct kvm_cpuid_entry2 *vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, - uint32_t function) -{ - return __vcpu_get_cpuid_entry(vcpu, function, 0); -} - -static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) -{ - int r; - - TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); - r = __vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); - if (r) - return r; - - /* On success, refresh the cache to pick up adjustments made by KVM. */ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); - return 0; -} - -static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) -{ - TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); - vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); - - /* Refresh the cache to pick up adjustments made by KVM. */ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); -} - -static inline void vcpu_get_cpuid(struct kvm_vcpu *vcpu) -{ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); -} - -void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_property property, - uint32_t value); -void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); - -void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); - -static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_feature feature) -{ - struct kvm_cpuid_entry2 *entry; - - entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); - return *((&entry->eax) + feature.reg) & BIT(feature.bit); -} - -void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_feature feature, - bool set); - -static inline void vcpu_set_cpuid_feature(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_feature feature) -{ - vcpu_set_or_clear_cpuid_feature(vcpu, feature, true); - -} - -static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_feature feature) -{ - vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); -} - -uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); -int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value); - -/* - * Assert on an MSR access(es) and pretty print the MSR name when possible. - * Note, the caller provides the stringified name so that the name of macro is - * printed, not the value the macro resolves to (due to macro expansion). - */ -#define TEST_ASSERT_MSR(cond, fmt, msr, str, args...) \ -do { \ - if (__builtin_constant_p(msr)) { \ - TEST_ASSERT(cond, fmt, str, args); \ - } else if (!(cond)) { \ - char buf[16]; \ - \ - snprintf(buf, sizeof(buf), "MSR 0x%x", msr); \ - TEST_ASSERT(cond, fmt, buf, args); \ - } \ -} while (0) - -/* - * Returns true if KVM should return the last written value when reading an MSR - * from userspace, e.g. the MSR isn't a command MSR, doesn't emulate state that - * is changing, etc. This is NOT an exhaustive list! The intent is to filter - * out MSRs that are not durable _and_ that a selftest wants to write. 
- */ -static inline bool is_durable_msr(uint32_t msr) -{ - return msr != MSR_IA32_TSC; -} - -#define vcpu_set_msr(vcpu, msr, val) \ -do { \ - uint64_t r, v = val; \ - \ - TEST_ASSERT_MSR(_vcpu_set_msr(vcpu, msr, v) == 1, \ - "KVM_SET_MSRS failed on %s, value = 0x%lx", msr, #msr, v); \ - if (!is_durable_msr(msr)) \ - break; \ - r = vcpu_get_msr(vcpu, msr); \ - TEST_ASSERT_MSR(r == v, "Set %s to '0x%lx', got back '0x%lx'", msr, #msr, v, r);\ -} while (0) - -void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); -void kvm_init_vm_address_properties(struct kvm_vm *vm); -bool vm_is_unrestricted_guest(struct kvm_vm *vm); - -struct ex_regs { - uint64_t rax, rcx, rdx, rbx; - uint64_t rbp, rsi, rdi; - uint64_t r8, r9, r10, r11; - uint64_t r12, r13, r14, r15; - uint64_t vector; - uint64_t error_code; - uint64_t rip; - uint64_t cs; - uint64_t rflags; -}; - -struct idt_entry { - uint16_t offset0; - uint16_t selector; - uint16_t ist : 3; - uint16_t : 5; - uint16_t type : 4; - uint16_t : 1; - uint16_t dpl : 2; - uint16_t p : 1; - uint16_t offset1; - uint32_t offset2; uint32_t reserved; -}; - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)); - -/* If a toddler were to say "abracadabra". */ -#define KVM_EXCEPTION_MAGIC 0xabacadabaULL - -/* - * KVM selftest exception fixup uses registers to coordinate with the exception - * handler, versus the kernel's in-memory tables and KVM-Unit-Tests's in-memory - * per-CPU data. Using only registers avoids having to map memory into the - * guest, doesn't require a valid, stable GS.base, and reduces the risk of - * for recursive faults when accessing memory in the handler. The downside to - * using registers is that it restricts what registers can be used by the actual - * instruction. But, selftests are 64-bit only, making register* pressure a - * minor concern. Use r9-r11 as they are volatile, i.e. don't need to be saved - * by the callee, and except for r11 are not implicit parameters to any - * instructions. Ideally, fixup would use r8-r10 and thus avoid implicit - * parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V - * is higher priority than testing non-faulting SYSCALL/SYSRET. - * - * Note, the fixup handler deliberately does not handle #DE, i.e. the vector - * is guaranteed to be non-zero on fault. - * - * REGISTER INPUTS: - * r9 = MAGIC - * r10 = RIP - * r11 = new RIP on fault - * - * REGISTER OUTPUTS: - * r9 = exception vector (non-zero) - * r10 = error code - */ -#define __KVM_ASM_SAFE(insn, fep) \ - "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ - "lea 1f(%%rip), %%r10\n\t" \ - "lea 2f(%%rip), %%r11\n\t" \ - fep "1: " insn "\n\t" \ - "xor %%r9, %%r9\n\t" \ - "2:\n\t" \ - "mov %%r9b, %[vector]\n\t" \ - "mov %%r10, %[error_code]\n\t" - -#define KVM_ASM_SAFE(insn) __KVM_ASM_SAFE(insn, "") -#define KVM_ASM_SAFE_FEP(insn) __KVM_ASM_SAFE(insn, KVM_FEP) - -#define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec) -#define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" - -#define kvm_asm_safe(insn, inputs...) \ -({ \ - uint64_t ign_error_code; \ - uint8_t vector; \ - \ - asm volatile(KVM_ASM_SAFE(insn) \ - : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ - : inputs \ - : KVM_ASM_SAFE_CLOBBERS); \ - vector; \ -}) - -#define kvm_asm_safe_ec(insn, error_code, inputs...) 
\ -({ \ - uint8_t vector; \ - \ - asm volatile(KVM_ASM_SAFE(insn) \ - : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ - : inputs \ - : KVM_ASM_SAFE_CLOBBERS); \ - vector; \ -}) - -#define kvm_asm_safe_fep(insn, inputs...) \ -({ \ - uint64_t ign_error_code; \ - uint8_t vector; \ - \ - asm volatile(KVM_ASM_SAFE(insn) \ - : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ - : inputs \ - : KVM_ASM_SAFE_CLOBBERS); \ - vector; \ -}) - -#define kvm_asm_safe_ec_fep(insn, error_code, inputs...) \ -({ \ - uint8_t vector; \ - \ - asm volatile(KVM_ASM_SAFE_FEP(insn) \ - : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ - : inputs \ - : KVM_ASM_SAFE_CLOBBERS); \ - vector; \ -}) - -#define BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ -static inline uint8_t insn##_safe ##_fep(uint32_t idx, uint64_t *val) \ -{ \ - uint64_t error_code; \ - uint8_t vector; \ - uint32_t a, d; \ - \ - asm volatile(KVM_ASM_SAFE##_FEP(#insn) \ - : "=a"(a), "=d"(d), \ - KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ - : "c"(idx) \ - : KVM_ASM_SAFE_CLOBBERS); \ - \ - *val = (uint64_t)a | ((uint64_t)d << 32); \ - return vector; \ -} - -/* - * Generate {insn}_safe() and {insn}_safe_fep() helpers for instructions that - * use ECX as in input index, and EDX:EAX as a 64-bit output. - */ -#define BUILD_READ_U64_SAFE_HELPERS(insn) \ - BUILD_READ_U64_SAFE_HELPER(insn, , ) \ - BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ - -BUILD_READ_U64_SAFE_HELPERS(rdmsr) -BUILD_READ_U64_SAFE_HELPERS(rdpmc) -BUILD_READ_U64_SAFE_HELPERS(xgetbv) - -static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) -{ - return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); -} - -static inline uint8_t xsetbv_safe(uint32_t index, uint64_t value) -{ - u32 eax = value; - u32 edx = value >> 32; - - return kvm_asm_safe("xsetbv", "a" (eax), "d" (edx), "c" (index)); -} - -bool kvm_is_tdp_enabled(void); - -static inline bool kvm_is_pmu_enabled(void) -{ - return get_kvm_param_bool("enable_pmu"); -} - -static inline bool kvm_is_forced_emulation_enabled(void) -{ - return !!get_kvm_param_integer("force_emulation_prefix"); -} - -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level); -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); - -uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, - uint64_t a3); -uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1); -void xen_hypercall(uint64_t nr, uint64_t a0, void *a1); - -static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa, - uint64_t size, uint64_t flags) -{ - return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0); -} - -static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, - uint64_t flags) -{ - uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags); - - GUEST_ASSERT(!ret); -} - -void __vm_xsave_require_permission(uint64_t xfeature, const char *name); - -#define vm_xsave_require_permission(xfeature) \ - __vm_xsave_require_permission(xfeature, #xfeature) - -enum pg_level { - PG_LEVEL_NONE, - PG_LEVEL_4K, - PG_LEVEL_2M, - PG_LEVEL_1G, - PG_LEVEL_512G, - PG_LEVEL_NUM -}; - -#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) -#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level)) - -#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K) -#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) -#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) - -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); -void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t 
paddr, - uint64_t nr_bytes, int level); - -/* - * Basic CPU control in CR0 - */ -#define X86_CR0_PE (1UL<<0) /* Protection Enable */ -#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */ -#define X86_CR0_EM (1UL<<2) /* Emulation */ -#define X86_CR0_TS (1UL<<3) /* Task Switched */ -#define X86_CR0_ET (1UL<<4) /* Extension Type */ -#define X86_CR0_NE (1UL<<5) /* Numeric Error */ -#define X86_CR0_WP (1UL<<16) /* Write Protect */ -#define X86_CR0_AM (1UL<<18) /* Alignment Mask */ -#define X86_CR0_NW (1UL<<29) /* Not Write-through */ -#define X86_CR0_CD (1UL<<30) /* Cache Disable */ -#define X86_CR0_PG (1UL<<31) /* Paging */ - -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) - -bool sys_clocksource_is_based_on_tsc(void); - -#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h deleted file mode 100644 index 82c11c81a956..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ /dev/null @@ -1,96 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Helpers used for SEV guests - * - */ -#ifndef SELFTEST_KVM_SEV_H -#define SELFTEST_KVM_SEV_H - -#include -#include - -#include "linux/psp-sev.h" - -#include "kvm_util.h" -#include "svm_util.h" -#include "processor.h" - -enum sev_guest_state { - SEV_GUEST_STATE_UNINITIALIZED = 0, - SEV_GUEST_STATE_LAUNCH_UPDATE, - SEV_GUEST_STATE_LAUNCH_SECRET, - SEV_GUEST_STATE_RUNNING, -}; - -#define SEV_POLICY_NO_DBG (1UL << 0) -#define SEV_POLICY_ES (1UL << 2) - -#define GHCB_MSR_TERM_REQ 0x100 - -void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); -void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); -void sev_vm_launch_finish(struct kvm_vm *vm); - -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, - struct kvm_vcpu **cpu); -void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); - -kvm_static_assert(SEV_RET_SUCCESS == 0); - -/* - * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long" - * instead of a proper struct. The size of the parameter is embedded in the - * ioctl number, i.e. is ABI and thus immutable. Hack around the mess by - * creating an overlay to pass in an "unsigned long" without a cast (casting - * will make the compiler unhappy due to dereferencing an aliased pointer). 
- */ -#define __vm_sev_ioctl(vm, cmd, arg) \ -({ \ - int r; \ - \ - union { \ - struct kvm_sev_cmd c; \ - unsigned long raw; \ - } sev_cmd = { .c = { \ - .id = (cmd), \ - .data = (uint64_t)(arg), \ - .sev_fd = (vm)->arch.sev_fd, \ - } }; \ - \ - r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &sev_cmd.raw); \ - r ?: sev_cmd.c.error; \ -}) - -#define vm_sev_ioctl(vm, cmd, arg) \ -({ \ - int ret = __vm_sev_ioctl(vm, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ -}) - -void sev_vm_init(struct kvm_vm *vm); -void sev_es_vm_init(struct kvm_vm *vm); - -static inline void sev_register_encrypted_memory(struct kvm_vm *vm, - struct userspace_mem_region *region) -{ - struct kvm_enc_region range = { - .addr = region->region.userspace_addr, - .size = region->region.memory_size, - }; - - vm_ioctl(vm, KVM_MEMORY_ENCRYPT_REG_REGION, &range); -} - -static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, - uint64_t size) -{ - struct kvm_sev_launch_update_data update_data = { - .uaddr = (unsigned long)addr_gpa2hva(vm, gpa), - .len = size, - }; - - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data); -} - -#endif /* SELFTEST_KVM_SEV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h deleted file mode 100644 index 4803e1056055..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ /dev/null @@ -1,326 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * tools/testing/selftests/kvm/include/x86_64/svm.h - * This is a copy of arch/x86/include/asm/svm.h - * - */ - -#ifndef SELFTEST_KVM_SVM_H -#define SELFTEST_KVM_SVM_H - -enum { - INTERCEPT_INTR, - INTERCEPT_NMI, - INTERCEPT_SMI, - INTERCEPT_INIT, - INTERCEPT_VINTR, - INTERCEPT_SELECTIVE_CR0, - INTERCEPT_STORE_IDTR, - INTERCEPT_STORE_GDTR, - INTERCEPT_STORE_LDTR, - INTERCEPT_STORE_TR, - INTERCEPT_LOAD_IDTR, - INTERCEPT_LOAD_GDTR, - INTERCEPT_LOAD_LDTR, - INTERCEPT_LOAD_TR, - INTERCEPT_RDTSC, - INTERCEPT_RDPMC, - INTERCEPT_PUSHF, - INTERCEPT_POPF, - INTERCEPT_CPUID, - INTERCEPT_RSM, - INTERCEPT_IRET, - INTERCEPT_INTn, - INTERCEPT_INVD, - INTERCEPT_PAUSE, - INTERCEPT_HLT, - INTERCEPT_INVLPG, - INTERCEPT_INVLPGA, - INTERCEPT_IOIO_PROT, - INTERCEPT_MSR_PROT, - INTERCEPT_TASK_SWITCH, - INTERCEPT_FERR_FREEZE, - INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, - INTERCEPT_VMMCALL, - INTERCEPT_VMLOAD, - INTERCEPT_VMSAVE, - INTERCEPT_STGI, - INTERCEPT_CLGI, - INTERCEPT_SKINIT, - INTERCEPT_RDTSCP, - INTERCEPT_ICEBP, - INTERCEPT_WBINVD, - INTERCEPT_MONITOR, - INTERCEPT_MWAIT, - INTERCEPT_MWAIT_COND, - INTERCEPT_XSETBV, - INTERCEPT_RDPRU, -}; - -struct hv_vmcb_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) - -/* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 -#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) - -struct __attribute__ ((__packed__)) vmcb_control_area { - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - u8 reserved_1[40]; - u16 pause_filter_thresh; - u16 pause_filter_count; - u64 iopm_base_pa; - u64 msrpm_base_pa; - u64 tsc_offset; - u32 asid; - u8 tlb_ctl; - u8 reserved_2[3]; - u32 int_ctl; - u32 int_vector; - u32 int_state; - u8 
reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; - u64 exit_info_1; - u64 exit_info_2; - u32 exit_int_info; - u32 exit_int_info_err; - u64 nested_ctl; - u64 avic_vapic_bar; - u8 reserved_4[8]; - u32 event_inj; - u32 event_inj_err; - u64 nested_cr3; - u64 virt_ext; - u32 clean; - u32 reserved_5; - u64 next_rip; - u8 insn_len; - u8 insn_bytes[15]; - u64 avic_backing_page; /* Offset 0xe0 */ - u8 reserved_6[8]; /* Offset 0xe8 */ - u64 avic_logical_id; /* Offset 0xf0 */ - u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[8]; - u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[720]; - /* - * Offset 0x3e0, 32 bytes reserved - * for use by hypervisor/software. - */ - union { - struct hv_vmcb_enlightenments hv_enlightenments; - u8 reserved_sw[32]; - }; -}; - - -#define TLB_CONTROL_DO_NOTHING 0 -#define TLB_CONTROL_FLUSH_ALL_ASID 1 -#define TLB_CONTROL_FLUSH_ASID 3 -#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 - -#define V_TPR_MASK 0x0f - -#define V_IRQ_SHIFT 8 -#define V_IRQ_MASK (1 << V_IRQ_SHIFT) - -#define V_GIF_SHIFT 9 -#define V_GIF_MASK (1 << V_GIF_SHIFT) - -#define V_INTR_PRIO_SHIFT 16 -#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) - -#define V_IGN_TPR_SHIFT 20 -#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) - -#define V_INTR_MASKING_SHIFT 24 -#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) - -#define V_GIF_ENABLE_SHIFT 25 -#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) - -#define AVIC_ENABLE_SHIFT 31 -#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) - -#define LBR_CTL_ENABLE_MASK BIT_ULL(0) -#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) - -#define SVM_INTERRUPT_SHADOW_MASK 1 - -#define SVM_IOIO_STR_SHIFT 2 -#define SVM_IOIO_REP_SHIFT 3 -#define SVM_IOIO_SIZE_SHIFT 4 -#define SVM_IOIO_ASIZE_SHIFT 7 - -#define SVM_IOIO_TYPE_MASK 1 -#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) -#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) -#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) -#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) - -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - -#define SVM_NESTED_CTL_NP_ENABLE BIT(0) -#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) - -struct __attribute__ ((__packed__)) vmcb_seg { - u16 selector; - u16 attrib; - u32 limit; - u64 base; -}; - -struct __attribute__ ((__packed__)) vmcb_save_area { - struct vmcb_seg es; - struct vmcb_seg cs; - struct vmcb_seg ss; - struct vmcb_seg ds; - struct vmcb_seg fs; - struct vmcb_seg gs; - struct vmcb_seg gdtr; - struct vmcb_seg ldtr; - struct vmcb_seg idtr; - struct vmcb_seg tr; - u8 reserved_1[43]; - u8 cpl; - u8 reserved_2[4]; - u64 efer; - u8 reserved_3[112]; - u64 cr4; - u64 cr3; - u64 cr0; - u64 dr7; - u64 dr6; - u64 rflags; - u64 rip; - u8 reserved_4[88]; - u64 rsp; - u8 reserved_5[24]; - u64 rax; - u64 star; - u64 lstar; - u64 cstar; - u64 sfmask; - u64 kernel_gs_base; - u64 sysenter_cs; - u64 sysenter_esp; - u64 sysenter_eip; - u64 cr2; - u8 reserved_6[32]; - u64 g_pat; - u64 dbgctl; - u64 br_from; - u64 br_to; - u64 last_excp_from; - u64 last_excp_to; -}; - -struct __attribute__ ((__packed__)) vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -}; - -#define SVM_VM_CR_SVM_DISABLE 4 - -#define SVM_SELECTOR_S_SHIFT 4 -#define SVM_SELECTOR_DPL_SHIFT 5 -#define SVM_SELECTOR_P_SHIFT 7 -#define SVM_SELECTOR_AVL_SHIFT 8 -#define SVM_SELECTOR_L_SHIFT 9 -#define SVM_SELECTOR_DB_SHIFT 10 -#define SVM_SELECTOR_G_SHIFT 11 - -#define SVM_SELECTOR_TYPE_MASK (0xf) -#define 
SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) -#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) -#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) -#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) -#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) -#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) -#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) - -#define SVM_SELECTOR_WRITE_MASK (1 << 1) -#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK -#define SVM_SELECTOR_CODE_MASK (1 << 3) - -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) - -#define SVM_EVTINJ_VEC_MASK 0xff - -#define SVM_EVTINJ_TYPE_SHIFT 8 -#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) - -#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) -#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) - -#define SVM_EVTINJ_VALID (1 << 31) -#define SVM_EVTINJ_VALID_ERR (1 << 11) - -#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK -#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK - -#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR -#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI -#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT -#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT - -#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID -#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR - -#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 -#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 -#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 - -#define SVM_EXITINFO_REG_MASK 0x0F - -#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) - -#endif /* SELFTEST_KVM_SVM_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h deleted file mode 100644 index 044f0f872ba9..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/x86_64/svm_utils.h - * Header for nested SVM testing - * - * Copyright (C) 2020, Red Hat, Inc. 
- */ - -#ifndef SELFTEST_KVM_SVM_UTILS_H -#define SELFTEST_KVM_SVM_UTILS_H - -#include - -#include -#include "svm.h" -#include "processor.h" - -struct svm_test_data { - /* VMCB */ - struct vmcb *vmcb; /* gva */ - void *vmcb_hva; - uint64_t vmcb_gpa; - - /* host state-save area */ - struct vmcb_save_area *save_area; /* gva */ - void *save_area_hva; - uint64_t save_area_gpa; - - /* MSR-Bitmap */ - void *msr; /* gva */ - void *msr_hva; - uint64_t msr_gpa; -}; - -static inline void vmmcall(void) -{ - /* - * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle - * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended - * use of this function is to exit to L1 from L2. Clobber all other - * GPRs as L1 doesn't correctly preserve them during vmexits. - */ - __asm__ __volatile__("push %%rbp; vmmcall; pop %%rbp" - : : "a"(0xdeadbeef), "c"(0xbeefdead) - : "rbx", "rdx", "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", "r14", "r15"); -} - -#define stgi() \ - __asm__ __volatile__( \ - "stgi\n" \ - ) - -#define clgi() \ - __asm__ __volatile__( \ - "clgi\n" \ - ) - -struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); -void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); -void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); - -int open_sev_dev_path_or_exit(void); - -#endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/ucall.h b/tools/testing/selftests/kvm/include/x86_64/ucall.h deleted file mode 100644 index d3825dcc3cd9..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/ucall.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_UCALL_H -#define SELFTEST_KVM_UCALL_H - -#include "kvm_util.h" - -#define UCALL_EXIT_REASON KVM_EXIT_IO - -static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) -{ -} - -#endif diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h deleted file mode 100644 index 5f0c0a29c556..000000000000 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ /dev/null @@ -1,577 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/x86_64/vmx.h - * - * Copyright (C) 2018, Google LLC. - */ - -#ifndef SELFTEST_KVM_VMX_H -#define SELFTEST_KVM_VMX_H - -#include - -#include -#include "processor.h" -#include "apic.h" - -/* - * Definitions of Primary Processor-Based VM-Execution Controls. 
- */ -#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 -#define CPU_BASED_HLT_EXITING 0x00000080 -#define CPU_BASED_INVLPG_EXITING 0x00000200 -#define CPU_BASED_MWAIT_EXITING 0x00000400 -#define CPU_BASED_RDPMC_EXITING 0x00000800 -#define CPU_BASED_RDTSC_EXITING 0x00001000 -#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 -#define CPU_BASED_CR3_STORE_EXITING 0x00010000 -#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 -#define CPU_BASED_CR8_STORE_EXITING 0x00100000 -#define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 -#define CPU_BASED_MOV_DR_EXITING 0x00800000 -#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 -#define CPU_BASED_USE_IO_BITMAPS 0x02000000 -#define CPU_BASED_MONITOR_TRAP 0x08000000 -#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 -#define CPU_BASED_MONITOR_EXITING 0x20000000 -#define CPU_BASED_PAUSE_EXITING 0x40000000 -#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 - -#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 - -/* - * Definitions of Secondary Processor-Based VM-Execution Controls. - */ -#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 -#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 -#define SECONDARY_EXEC_DESC 0x00000004 -#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 -#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 -#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 -#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 -#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 -#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 -#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 -#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 -#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 -#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 -#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 -#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 -#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 -#define SECONDARY_EXEC_ENABLE_PML 0x00020000 -#define SECONDARY_EPT_VE 0x00040000 -#define SECONDARY_ENABLE_XSAV_RESTORE 0x00100000 -#define SECONDARY_EXEC_TSC_SCALING 0x02000000 - -#define PIN_BASED_EXT_INTR_MASK 0x00000001 -#define PIN_BASED_NMI_EXITING 0x00000008 -#define PIN_BASED_VIRTUAL_NMIS 0x00000020 -#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 -#define PIN_BASED_POSTED_INTR 0x00000080 - -#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 - -#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 -#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 -#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 -#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 -#define VM_EXIT_SAVE_IA32_PAT 0x00040000 -#define VM_EXIT_LOAD_IA32_PAT 0x00080000 -#define VM_EXIT_SAVE_IA32_EFER 0x00100000 -#define VM_EXIT_LOAD_IA32_EFER 0x00200000 -#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 - -#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff - -#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 -#define VM_ENTRY_IA32E_MODE 0x00000200 -#define VM_ENTRY_SMM 0x00000400 -#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 -#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 -#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 -#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 - -#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff - -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f -#define VMX_MISC_SAVE_EFER_LMA 0x00000020 - -#define VMX_EPT_VPID_CAP_1G_PAGES 0x00020000 -#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 - -#define EXIT_REASON_FAILED_VMENTRY 0x80000000 - -enum vmcs_field { - VIRTUAL_PROCESSOR_ID = 0x00000000, - 
POSTED_INTR_NV = 0x00000002, - GUEST_ES_SELECTOR = 0x00000800, - GUEST_CS_SELECTOR = 0x00000802, - GUEST_SS_SELECTOR = 0x00000804, - GUEST_DS_SELECTOR = 0x00000806, - GUEST_FS_SELECTOR = 0x00000808, - GUEST_GS_SELECTOR = 0x0000080a, - GUEST_LDTR_SELECTOR = 0x0000080c, - GUEST_TR_SELECTOR = 0x0000080e, - GUEST_INTR_STATUS = 0x00000810, - GUEST_PML_INDEX = 0x00000812, - HOST_ES_SELECTOR = 0x00000c00, - HOST_CS_SELECTOR = 0x00000c02, - HOST_SS_SELECTOR = 0x00000c04, - HOST_DS_SELECTOR = 0x00000c06, - HOST_FS_SELECTOR = 0x00000c08, - HOST_GS_SELECTOR = 0x00000c0a, - HOST_TR_SELECTOR = 0x00000c0c, - IO_BITMAP_A = 0x00002000, - IO_BITMAP_A_HIGH = 0x00002001, - IO_BITMAP_B = 0x00002002, - IO_BITMAP_B_HIGH = 0x00002003, - MSR_BITMAP = 0x00002004, - MSR_BITMAP_HIGH = 0x00002005, - VM_EXIT_MSR_STORE_ADDR = 0x00002006, - VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, - VM_EXIT_MSR_LOAD_ADDR = 0x00002008, - VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, - VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, - VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, - PML_ADDRESS = 0x0000200e, - PML_ADDRESS_HIGH = 0x0000200f, - TSC_OFFSET = 0x00002010, - TSC_OFFSET_HIGH = 0x00002011, - VIRTUAL_APIC_PAGE_ADDR = 0x00002012, - VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, - APIC_ACCESS_ADDR = 0x00002014, - APIC_ACCESS_ADDR_HIGH = 0x00002015, - POSTED_INTR_DESC_ADDR = 0x00002016, - POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, - EPT_POINTER = 0x0000201a, - EPT_POINTER_HIGH = 0x0000201b, - EOI_EXIT_BITMAP0 = 0x0000201c, - EOI_EXIT_BITMAP0_HIGH = 0x0000201d, - EOI_EXIT_BITMAP1 = 0x0000201e, - EOI_EXIT_BITMAP1_HIGH = 0x0000201f, - EOI_EXIT_BITMAP2 = 0x00002020, - EOI_EXIT_BITMAP2_HIGH = 0x00002021, - EOI_EXIT_BITMAP3 = 0x00002022, - EOI_EXIT_BITMAP3_HIGH = 0x00002023, - VMREAD_BITMAP = 0x00002026, - VMREAD_BITMAP_HIGH = 0x00002027, - VMWRITE_BITMAP = 0x00002028, - VMWRITE_BITMAP_HIGH = 0x00002029, - XSS_EXIT_BITMAP = 0x0000202C, - XSS_EXIT_BITMAP_HIGH = 0x0000202D, - ENCLS_EXITING_BITMAP = 0x0000202E, - ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, - TSC_MULTIPLIER = 0x00002032, - TSC_MULTIPLIER_HIGH = 0x00002033, - GUEST_PHYSICAL_ADDRESS = 0x00002400, - GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, - VMCS_LINK_POINTER = 0x00002800, - VMCS_LINK_POINTER_HIGH = 0x00002801, - GUEST_IA32_DEBUGCTL = 0x00002802, - GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, - GUEST_IA32_PAT = 0x00002804, - GUEST_IA32_PAT_HIGH = 0x00002805, - GUEST_IA32_EFER = 0x00002806, - GUEST_IA32_EFER_HIGH = 0x00002807, - GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, - GUEST_PDPTR0 = 0x0000280a, - GUEST_PDPTR0_HIGH = 0x0000280b, - GUEST_PDPTR1 = 0x0000280c, - GUEST_PDPTR1_HIGH = 0x0000280d, - GUEST_PDPTR2 = 0x0000280e, - GUEST_PDPTR2_HIGH = 0x0000280f, - GUEST_PDPTR3 = 0x00002810, - GUEST_PDPTR3_HIGH = 0x00002811, - GUEST_BNDCFGS = 0x00002812, - GUEST_BNDCFGS_HIGH = 0x00002813, - HOST_IA32_PAT = 0x00002c00, - HOST_IA32_PAT_HIGH = 0x00002c01, - HOST_IA32_EFER = 0x00002c02, - HOST_IA32_EFER_HIGH = 0x00002c03, - HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, - PIN_BASED_VM_EXEC_CONTROL = 0x00004000, - CPU_BASED_VM_EXEC_CONTROL = 0x00004002, - EXCEPTION_BITMAP = 0x00004004, - PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, - PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, - CR3_TARGET_COUNT = 0x0000400a, - VM_EXIT_CONTROLS = 0x0000400c, - VM_EXIT_MSR_STORE_COUNT = 0x0000400e, - VM_EXIT_MSR_LOAD_COUNT = 0x00004010, - VM_ENTRY_CONTROLS = 0x00004012, - VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, - VM_ENTRY_INTR_INFO_FIELD = 0x00004016, - 
VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, - VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, - TPR_THRESHOLD = 0x0000401c, - SECONDARY_VM_EXEC_CONTROL = 0x0000401e, - PLE_GAP = 0x00004020, - PLE_WINDOW = 0x00004022, - VM_INSTRUCTION_ERROR = 0x00004400, - VM_EXIT_REASON = 0x00004402, - VM_EXIT_INTR_INFO = 0x00004404, - VM_EXIT_INTR_ERROR_CODE = 0x00004406, - IDT_VECTORING_INFO_FIELD = 0x00004408, - IDT_VECTORING_ERROR_CODE = 0x0000440a, - VM_EXIT_INSTRUCTION_LEN = 0x0000440c, - VMX_INSTRUCTION_INFO = 0x0000440e, - GUEST_ES_LIMIT = 0x00004800, - GUEST_CS_LIMIT = 0x00004802, - GUEST_SS_LIMIT = 0x00004804, - GUEST_DS_LIMIT = 0x00004806, - GUEST_FS_LIMIT = 0x00004808, - GUEST_GS_LIMIT = 0x0000480a, - GUEST_LDTR_LIMIT = 0x0000480c, - GUEST_TR_LIMIT = 0x0000480e, - GUEST_GDTR_LIMIT = 0x00004810, - GUEST_IDTR_LIMIT = 0x00004812, - GUEST_ES_AR_BYTES = 0x00004814, - GUEST_CS_AR_BYTES = 0x00004816, - GUEST_SS_AR_BYTES = 0x00004818, - GUEST_DS_AR_BYTES = 0x0000481a, - GUEST_FS_AR_BYTES = 0x0000481c, - GUEST_GS_AR_BYTES = 0x0000481e, - GUEST_LDTR_AR_BYTES = 0x00004820, - GUEST_TR_AR_BYTES = 0x00004822, - GUEST_INTERRUPTIBILITY_INFO = 0x00004824, - GUEST_ACTIVITY_STATE = 0X00004826, - GUEST_SYSENTER_CS = 0x0000482A, - VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - HOST_IA32_SYSENTER_CS = 0x00004c00, - CR0_GUEST_HOST_MASK = 0x00006000, - CR4_GUEST_HOST_MASK = 0x00006002, - CR0_READ_SHADOW = 0x00006004, - CR4_READ_SHADOW = 0x00006006, - CR3_TARGET_VALUE0 = 0x00006008, - CR3_TARGET_VALUE1 = 0x0000600a, - CR3_TARGET_VALUE2 = 0x0000600c, - CR3_TARGET_VALUE3 = 0x0000600e, - EXIT_QUALIFICATION = 0x00006400, - GUEST_LINEAR_ADDRESS = 0x0000640a, - GUEST_CR0 = 0x00006800, - GUEST_CR3 = 0x00006802, - GUEST_CR4 = 0x00006804, - GUEST_ES_BASE = 0x00006806, - GUEST_CS_BASE = 0x00006808, - GUEST_SS_BASE = 0x0000680a, - GUEST_DS_BASE = 0x0000680c, - GUEST_FS_BASE = 0x0000680e, - GUEST_GS_BASE = 0x00006810, - GUEST_LDTR_BASE = 0x00006812, - GUEST_TR_BASE = 0x00006814, - GUEST_GDTR_BASE = 0x00006816, - GUEST_IDTR_BASE = 0x00006818, - GUEST_DR7 = 0x0000681a, - GUEST_RSP = 0x0000681c, - GUEST_RIP = 0x0000681e, - GUEST_RFLAGS = 0x00006820, - GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, - GUEST_SYSENTER_ESP = 0x00006824, - GUEST_SYSENTER_EIP = 0x00006826, - HOST_CR0 = 0x00006c00, - HOST_CR3 = 0x00006c02, - HOST_CR4 = 0x00006c04, - HOST_FS_BASE = 0x00006c06, - HOST_GS_BASE = 0x00006c08, - HOST_TR_BASE = 0x00006c0a, - HOST_GDTR_BASE = 0x00006c0c, - HOST_IDTR_BASE = 0x00006c0e, - HOST_IA32_SYSENTER_ESP = 0x00006c10, - HOST_IA32_SYSENTER_EIP = 0x00006c12, - HOST_RSP = 0x00006c14, - HOST_RIP = 0x00006c16, -}; - -struct vmx_msr_entry { - uint32_t index; - uint32_t reserved; - uint64_t value; -} __attribute__ ((aligned(16))); - -#include "evmcs.h" - -static inline int vmxon(uint64_t phys) -{ - uint8_t ret; - - __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]" - : [ret]"=rm"(ret) - : [pa]"m"(phys) - : "cc", "memory"); - - return ret; -} - -static inline void vmxoff(void) -{ - __asm__ __volatile__("vmxoff"); -} - -static inline int vmclear(uint64_t vmcs_pa) -{ - uint8_t ret; - - __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]" - : [ret]"=rm"(ret) - : [pa]"m"(vmcs_pa) - : "cc", "memory"); - - return ret; -} - -static inline int vmptrld(uint64_t vmcs_pa) -{ - uint8_t ret; - - if (enable_evmcs) - return -1; - - __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]" - : [ret]"=rm"(ret) - : [pa]"m"(vmcs_pa) - : "cc", "memory"); - - return ret; -} - -static inline int vmptrst(uint64_t *value) -{ - uint64_t tmp; - uint8_t ret; - - if (enable_evmcs) - 
return evmcs_vmptrst(value); - - __asm__ __volatile__("vmptrst %[value]; setna %[ret]" - : [value]"=m"(tmp), [ret]"=rm"(ret) - : : "cc", "memory"); - - *value = tmp; - return ret; -} - -/* - * A wrapper around vmptrst that ignores errors and returns zero if the - * vmptrst instruction fails. - */ -static inline uint64_t vmptrstz(void) -{ - uint64_t value = 0; - vmptrst(&value); - return value; -} - -/* - * No guest state (e.g. GPRs) is established by this vmlaunch. - */ -static inline int vmlaunch(void) -{ - int ret; - - if (enable_evmcs) - return evmcs_vmlaunch(); - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "vmwrite %%rsp, %[host_rsp];" - "lea 1f(%%rip), %%rax;" - "vmwrite %%rax, %[host_rip];" - "vmlaunch;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r"((uint64_t)HOST_RSP), - [host_rip]"r"((uint64_t)HOST_RIP) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -/* - * No guest state (e.g. GPRs) is established by this vmresume. - */ -static inline int vmresume(void) -{ - int ret; - - if (enable_evmcs) - return evmcs_vmresume(); - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "vmwrite %%rsp, %[host_rsp];" - "lea 1f(%%rip), %%rax;" - "vmwrite %%rax, %[host_rip];" - "vmresume;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r"((uint64_t)HOST_RSP), - [host_rip]"r"((uint64_t)HOST_RIP) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -static inline void vmcall(void) -{ - /* - * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle - * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended - * use of this function is to exit to L1 from L2. Clobber all other - * GPRs as L1 doesn't correctly preserve them during vmexits. - */ - __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" - : : "a"(0xdeadbeef), "c"(0xbeefdead) - : "rbx", "rdx", "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", "r14", "r15"); -} - -static inline int vmread(uint64_t encoding, uint64_t *value) -{ - uint64_t tmp; - uint8_t ret; - - if (enable_evmcs) - return evmcs_vmread(encoding, value); - - __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]" - : [value]"=rm"(tmp), [ret]"=rm"(ret) - : [encoding]"r"(encoding) - : "cc", "memory"); - - *value = tmp; - return ret; -} - -/* - * A wrapper around vmread that ignores errors and returns zero if the - * vmread instruction fails. 
- */ -static inline uint64_t vmreadz(uint64_t encoding) -{ - uint64_t value = 0; - vmread(encoding, &value); - return value; -} - -static inline int vmwrite(uint64_t encoding, uint64_t value) -{ - uint8_t ret; - - if (enable_evmcs) - return evmcs_vmwrite(encoding, value); - - __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]" - : [ret]"=rm"(ret) - : [value]"rm"(value), [encoding]"r"(encoding) - : "cc", "memory"); - - return ret; -} - -static inline uint32_t vmcs_revision(void) -{ - return rdmsr(MSR_IA32_VMX_BASIC); -} - -struct vmx_pages { - void *vmxon_hva; - uint64_t vmxon_gpa; - void *vmxon; - - void *vmcs_hva; - uint64_t vmcs_gpa; - void *vmcs; - - void *msr_hva; - uint64_t msr_gpa; - void *msr; - - void *shadow_vmcs_hva; - uint64_t shadow_vmcs_gpa; - void *shadow_vmcs; - - void *vmread_hva; - uint64_t vmread_gpa; - void *vmread; - - void *vmwrite_hva; - uint64_t vmwrite_gpa; - void *vmwrite; - - void *eptp_hva; - uint64_t eptp_gpa; - void *eptp; - - void *apic_access_hva; - uint64_t apic_access_gpa; - void *apic_access; -}; - -union vmx_basic { - u64 val; - struct { - u32 revision; - u32 size:13, - reserved1:3, - width:1, - dual:1, - type:4, - insouts:1, - ctrl:1, - vm_entry_exception_ctrl:1, - reserved2:7; - }; -}; - -union vmx_ctrl_msr { - u64 val; - struct { - u32 set, clr; - }; -}; - -struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); -bool prepare_for_vmx_operation(struct vmx_pages *vmx); -void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); -bool load_vmcs(struct vmx_pages *vmx); - -bool ept_1g_pages_supported(void); - -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr); -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot); -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); -bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); - -#endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c deleted file mode 100644 index 7abbf8866512..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/gic.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ARM Generic Interrupt Controller (GIC) support - */ - -#include -#include -#include - -#include "kvm_util.h" - -#include -#include "gic_private.h" -#include "processor.h" -#include "spinlock.h" - -static const struct gic_common_ops *gic_common_ops; -static struct spinlock gic_lock; - -static void gic_cpu_init(unsigned int cpu) -{ - gic_common_ops->gic_cpu_init(cpu); -} - -static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) -{ - const struct gic_common_ops *gic_ops = NULL; - - spin_lock(&gic_lock); - - /* Distributor initialization is needed only once per VM */ - if (gic_common_ops) { - spin_unlock(&gic_lock); - return; - } - - if (type == GIC_V3) - gic_ops = &gicv3_ops; - - GUEST_ASSERT(gic_ops); - - gic_ops->gic_init(nr_cpus); - gic_common_ops = gic_ops; - - /* Make sure that the initialized data is visible to all the vCPUs */ - dsb(sy); - - spin_unlock(&gic_lock); -} - -void gic_init(enum gic_type type, unsigned int nr_cpus) -{ - uint32_t cpu = 
guest_get_vcpuid(); - - GUEST_ASSERT(type < GIC_TYPE_MAX); - GUEST_ASSERT(nr_cpus); - - gic_dist_init(type, nr_cpus); - gic_cpu_init(cpu); -} - -void gic_irq_enable(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_enable(intid); -} - -void gic_irq_disable(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_disable(intid); -} - -unsigned int gic_get_and_ack_irq(void) -{ - uint64_t irqstat; - unsigned int intid; - - GUEST_ASSERT(gic_common_ops); - - irqstat = gic_common_ops->gic_read_iar(); - intid = irqstat & GENMASK(23, 0); - - return intid; -} - -void gic_set_eoi(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_write_eoir(intid); -} - -void gic_set_dir(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_write_dir(intid); -} - -void gic_set_eoi_split(bool split) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_set_eoi_split(split); -} - -void gic_set_priority_mask(uint64_t pmr) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_set_priority_mask(pmr); -} - -void gic_set_priority(unsigned int intid, unsigned int prio) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_set_priority(intid, prio); -} - -void gic_irq_set_active(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_set_active(intid); -} - -void gic_irq_clear_active(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_clear_active(intid); -} - -bool gic_irq_get_active(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - return gic_common_ops->gic_irq_get_active(intid); -} - -void gic_irq_set_pending(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_set_pending(intid); -} - -void gic_irq_clear_pending(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_clear_pending(intid); -} - -bool gic_irq_get_pending(unsigned int intid) -{ - GUEST_ASSERT(gic_common_ops); - return gic_common_ops->gic_irq_get_pending(intid); -} - -void gic_irq_set_config(unsigned int intid, bool is_edge) -{ - GUEST_ASSERT(gic_common_ops); - gic_common_ops->gic_irq_set_config(intid, is_edge); -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h deleted file mode 100644 index d24e9ecc96c6..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ARM Generic Interrupt Controller (GIC) private defines that's only - * shared among the GIC library code. 
- */ - -#ifndef SELFTEST_KVM_GIC_PRIVATE_H -#define SELFTEST_KVM_GIC_PRIVATE_H - -struct gic_common_ops { - void (*gic_init)(unsigned int nr_cpus); - void (*gic_cpu_init)(unsigned int cpu); - void (*gic_irq_enable)(unsigned int intid); - void (*gic_irq_disable)(unsigned int intid); - uint64_t (*gic_read_iar)(void); - void (*gic_write_eoir)(uint32_t irq); - void (*gic_write_dir)(uint32_t irq); - void (*gic_set_eoi_split)(bool split); - void (*gic_set_priority_mask)(uint64_t mask); - void (*gic_set_priority)(uint32_t intid, uint32_t prio); - void (*gic_irq_set_active)(uint32_t intid); - void (*gic_irq_clear_active)(uint32_t intid); - bool (*gic_irq_get_active)(uint32_t intid); - void (*gic_irq_set_pending)(uint32_t intid); - void (*gic_irq_clear_pending)(uint32_t intid); - bool (*gic_irq_get_pending)(uint32_t intid); - void (*gic_irq_set_config)(uint32_t intid, bool is_edge); -}; - -extern const struct gic_common_ops gicv3_ops; - -#endif /* SELFTEST_KVM_GIC_PRIVATE_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c deleted file mode 100644 index 66d05506f78b..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ARM Generic Interrupt Controller (GIC) v3 support - */ - -#include - -#include "kvm_util.h" -#include "processor.h" -#include "delay.h" - -#include "gic.h" -#include "gic_v3.h" -#include "gic_private.h" - -#define GICV3_MAX_CPUS 512 - -#define GICD_INT_DEF_PRI 0xa0 -#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ - (GICD_INT_DEF_PRI << 16) |\ - (GICD_INT_DEF_PRI << 8) |\ - GICD_INT_DEF_PRI) - -#define ICC_PMR_DEF_PRIO 0xf0 - -struct gicv3_data { - unsigned int nr_cpus; - unsigned int nr_spis; -}; - -#define sgi_base_from_redist(redist_base) (redist_base + SZ_64K) -#define DIST_BIT (1U << 31) - -enum gicv3_intid_range { - SGI_RANGE, - PPI_RANGE, - SPI_RANGE, - INVALID_RANGE, -}; - -static struct gicv3_data gicv3_data; - -static void gicv3_gicd_wait_for_rwp(void) -{ - unsigned int count = 100000; /* 1s */ - - while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { - GUEST_ASSERT(count--); - udelay(10); - } -} - -static inline volatile void *gicr_base_cpu(uint32_t cpu) -{ - /* Align all the redistributors sequentially */ - return GICR_BASE_GVA + cpu * SZ_64K * 2; -} - -static void gicv3_gicr_wait_for_rwp(uint32_t cpu) -{ - unsigned int count = 100000; /* 1s */ - - while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { - GUEST_ASSERT(count--); - udelay(10); - } -} - -static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) -{ - if (cpu_or_dist & DIST_BIT) - gicv3_gicd_wait_for_rwp(); - else - gicv3_gicr_wait_for_rwp(cpu_or_dist); -} - -static enum gicv3_intid_range get_intid_range(unsigned int intid) -{ - switch (intid) { - case 0 ... 15: - return SGI_RANGE; - case 16 ... 31: - return PPI_RANGE; - case 32 ... 
1019: - return SPI_RANGE; - } - - /* We should not be reaching here */ - GUEST_ASSERT(0); - - return INVALID_RANGE; -} - -static uint64_t gicv3_read_iar(void) -{ - uint64_t irqstat = read_sysreg_s(SYS_ICC_IAR1_EL1); - - dsb(sy); - return irqstat; -} - -static void gicv3_write_eoir(uint32_t irq) -{ - write_sysreg_s(irq, SYS_ICC_EOIR1_EL1); - isb(); -} - -static void gicv3_write_dir(uint32_t irq) -{ - write_sysreg_s(irq, SYS_ICC_DIR_EL1); - isb(); -} - -static void gicv3_set_priority_mask(uint64_t mask) -{ - write_sysreg_s(mask, SYS_ICC_PMR_EL1); -} - -static void gicv3_set_eoi_split(bool split) -{ - uint32_t val; - - /* - * All other fields are read-only, so no need to read CTLR first. In - * fact, the kernel does the same. - */ - val = split ? (1U << 1) : 0; - write_sysreg_s(val, SYS_ICC_CTLR_EL1); - isb(); -} - -uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) -{ - volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA - : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); - return readl(base + offset); -} - -void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) -{ - volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA - : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); - writel(reg_val, base + offset); -} - -uint32_t gicv3_getl_fields(uint32_t cpu_or_dist, uint64_t offset, uint32_t mask) -{ - return gicv3_reg_readl(cpu_or_dist, offset) & mask; -} - -void gicv3_setl_fields(uint32_t cpu_or_dist, uint64_t offset, - uint32_t mask, uint32_t reg_val) -{ - uint32_t tmp = gicv3_reg_readl(cpu_or_dist, offset) & ~mask; - - tmp |= (reg_val & mask); - gicv3_reg_writel(cpu_or_dist, offset, tmp); -} - -/* - * We use a single offset for the distributor and redistributor maps as they - * have the same value in both. The only exceptions are registers that only - * exist in one and not the other, like GICR_WAKER that doesn't exist in the - * distributor map. Such registers are conveniently marked as reserved in the - * map that doesn't implement it; like GICR_WAKER's offset of 0x0014 being - * marked as "Reserved" in the Distributor map. - */ -static void gicv3_access_reg(uint32_t intid, uint64_t offset, - uint32_t reg_bits, uint32_t bits_per_field, - bool write, uint32_t *val) -{ - uint32_t cpu = guest_get_vcpuid(); - enum gicv3_intid_range intid_range = get_intid_range(intid); - uint32_t fields_per_reg, index, mask, shift; - uint32_t cpu_or_dist; - - GUEST_ASSERT(bits_per_field <= reg_bits); - GUEST_ASSERT(!write || *val < (1U << bits_per_field)); - /* - * This function does not support 64 bit accesses. Just asserting here - * until we implement readq/writeq. - */ - GUEST_ASSERT(reg_bits == 32); - - fields_per_reg = reg_bits / bits_per_field; - index = intid % fields_per_reg; - shift = index * bits_per_field; - mask = ((1U << bits_per_field) - 1) << shift; - - /* Set offset to the actual register holding intid's config. */ - offset += (intid / fields_per_reg) * (reg_bits / 8); - - cpu_or_dist = (intid_range == SPI_RANGE) ? 
DIST_BIT : cpu; - - if (write) - gicv3_setl_fields(cpu_or_dist, offset, mask, *val << shift); - *val = gicv3_getl_fields(cpu_or_dist, offset, mask) >> shift; -} - -static void gicv3_write_reg(uint32_t intid, uint64_t offset, - uint32_t reg_bits, uint32_t bits_per_field, uint32_t val) -{ - gicv3_access_reg(intid, offset, reg_bits, - bits_per_field, true, &val); -} - -static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset, - uint32_t reg_bits, uint32_t bits_per_field) -{ - uint32_t val; - - gicv3_access_reg(intid, offset, reg_bits, - bits_per_field, false, &val); - return val; -} - -static void gicv3_set_priority(uint32_t intid, uint32_t prio) -{ - gicv3_write_reg(intid, GICD_IPRIORITYR, 32, 8, prio); -} - -/* Sets the intid to be level-sensitive or edge-triggered. */ -static void gicv3_irq_set_config(uint32_t intid, bool is_edge) -{ - uint32_t val; - - /* N/A for private interrupts. */ - GUEST_ASSERT(get_intid_range(intid) == SPI_RANGE); - val = is_edge ? 2 : 0; - gicv3_write_reg(intid, GICD_ICFGR, 32, 2, val); -} - -static void gicv3_irq_enable(uint32_t intid) -{ - bool is_spi = get_intid_range(intid) == SPI_RANGE; - uint32_t cpu = guest_get_vcpuid(); - - gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, 1); - gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); -} - -static void gicv3_irq_disable(uint32_t intid) -{ - bool is_spi = get_intid_range(intid) == SPI_RANGE; - uint32_t cpu = guest_get_vcpuid(); - - gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, 1); - gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); -} - -static void gicv3_irq_set_active(uint32_t intid) -{ - gicv3_write_reg(intid, GICD_ISACTIVER, 32, 1, 1); -} - -static void gicv3_irq_clear_active(uint32_t intid) -{ - gicv3_write_reg(intid, GICD_ICACTIVER, 32, 1, 1); -} - -static bool gicv3_irq_get_active(uint32_t intid) -{ - return gicv3_read_reg(intid, GICD_ISACTIVER, 32, 1); -} - -static void gicv3_irq_set_pending(uint32_t intid) -{ - gicv3_write_reg(intid, GICD_ISPENDR, 32, 1, 1); -} - -static void gicv3_irq_clear_pending(uint32_t intid) -{ - gicv3_write_reg(intid, GICD_ICPENDR, 32, 1, 1); -} - -static bool gicv3_irq_get_pending(uint32_t intid) -{ - return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); -} - -static void gicv3_enable_redist(volatile void *redist_base) -{ - uint32_t val = readl(redist_base + GICR_WAKER); - unsigned int count = 100000; /* 1s */ - - val &= ~GICR_WAKER_ProcessorSleep; - writel(val, redist_base + GICR_WAKER); - - /* Wait until the processor is 'active' */ - while (readl(redist_base + GICR_WAKER) & GICR_WAKER_ChildrenAsleep) { - GUEST_ASSERT(count--); - udelay(10); - } -} - -static void gicv3_cpu_init(unsigned int cpu) -{ - volatile void *sgi_base; - unsigned int i; - volatile void *redist_base_cpu; - - GUEST_ASSERT(cpu < gicv3_data.nr_cpus); - - redist_base_cpu = gicr_base_cpu(cpu); - sgi_base = sgi_base_from_redist(redist_base_cpu); - - gicv3_enable_redist(redist_base_cpu); - - /* - * Mark all the SGI and PPI interrupts as non-secure Group-1. - * Also, deactivate and disable them. 
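
Editor's note (illustrative aside, not part of the patch): gicv3_cpu_init() above locates each vCPU's redistributor via gicr_base_cpu() and sgi_base_from_redist(). Below is a minimal standalone sketch of that frame arithmetic; the 0x80A0000 base address is a made-up value for illustration, and only the "RD_base frame plus SGI_base frame, 2 x 64KiB per CPU" stride mirrors the code above.

#include <stdint.h>
#include <stdio.h>

#define SZ_64K		0x10000ULL
#define GICR_BASE	0x80A0000ULL	/* hypothetical guest-visible base */

/* Each CPU owns two consecutive 64KiB frames: RD_base, then SGI_base. */
static uint64_t gicr_base_cpu(uint32_t cpu)
{
	return GICR_BASE + (uint64_t)cpu * SZ_64K * 2;
}

static uint64_t sgi_base_from_redist(uint64_t redist_base)
{
	return redist_base + SZ_64K;
}

int main(void)
{
	for (uint32_t cpu = 0; cpu < 3; cpu++)
		printf("cpu %u: RD_base 0x%llx SGI_base 0x%llx\n", cpu,
		       (unsigned long long)gicr_base_cpu(cpu),
		       (unsigned long long)sgi_base_from_redist(gicr_base_cpu(cpu)));
	return 0;
}

(End of editor's note.)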
- */ - writel(~0, sgi_base + GICR_IGROUPR0); - writel(~0, sgi_base + GICR_ICACTIVER0); - writel(~0, sgi_base + GICR_ICENABLER0); - - /* Set a default priority for all the SGIs and PPIs */ - for (i = 0; i < 32; i += 4) - writel(GICD_INT_DEF_PRI_X4, - sgi_base + GICR_IPRIORITYR0 + i); - - gicv3_gicr_wait_for_rwp(cpu); - - /* Enable the GIC system register (ICC_*) access */ - write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, - SYS_ICC_SRE_EL1); - - /* Set a default priority threshold */ - write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); - - /* Enable non-secure Group-1 interrupts */ - write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); -} - -static void gicv3_dist_init(void) -{ - unsigned int i; - - /* Disable the distributor until we set things up */ - writel(0, GICD_BASE_GVA + GICD_CTLR); - gicv3_gicd_wait_for_rwp(); - - /* - * Mark all the SPI interrupts as non-secure Group-1. - * Also, deactivate and disable them. - */ - for (i = 32; i < gicv3_data.nr_spis; i += 32) { - writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); - writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); - writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); - } - - /* Set a default priority for all the SPIs */ - for (i = 32; i < gicv3_data.nr_spis; i += 4) - writel(GICD_INT_DEF_PRI_X4, - GICD_BASE_GVA + GICD_IPRIORITYR + i); - - /* Wait for the settings to sync-in */ - gicv3_gicd_wait_for_rwp(); - - /* Finally, enable the distributor globally with ARE */ - writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | - GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); - gicv3_gicd_wait_for_rwp(); -} - -static void gicv3_init(unsigned int nr_cpus) -{ - GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); - - gicv3_data.nr_cpus = nr_cpus; - gicv3_data.nr_spis = GICD_TYPER_SPIS( - readl(GICD_BASE_GVA + GICD_TYPER)); - if (gicv3_data.nr_spis > 1020) - gicv3_data.nr_spis = 1020; - - /* - * Initialize only the distributor for now. - * The redistributor and CPU interfaces are initialized - * later for every PE. 
- */ - gicv3_dist_init(); -} - -const struct gic_common_ops gicv3_ops = { - .gic_init = gicv3_init, - .gic_cpu_init = gicv3_cpu_init, - .gic_irq_enable = gicv3_irq_enable, - .gic_irq_disable = gicv3_irq_disable, - .gic_read_iar = gicv3_read_iar, - .gic_write_eoir = gicv3_write_eoir, - .gic_write_dir = gicv3_write_dir, - .gic_set_priority_mask = gicv3_set_priority_mask, - .gic_set_eoi_split = gicv3_set_eoi_split, - .gic_set_priority = gicv3_set_priority, - .gic_irq_set_active = gicv3_irq_set_active, - .gic_irq_clear_active = gicv3_irq_clear_active, - .gic_irq_get_active = gicv3_irq_get_active, - .gic_irq_set_pending = gicv3_irq_set_pending, - .gic_irq_clear_pending = gicv3_irq_clear_pending, - .gic_irq_get_pending = gicv3_irq_get_pending, - .gic_irq_set_config = gicv3_irq_set_config, -}; - -void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, - vm_paddr_t pend_table) -{ - volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); - - u32 ctlr; - u64 val; - - val = (cfg_table | - GICR_PROPBASER_InnerShareable | - GICR_PROPBASER_RaWaWb | - ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); - writeq_relaxed(val, rdist_base + GICR_PROPBASER); - - val = (pend_table | - GICR_PENDBASER_InnerShareable | - GICR_PENDBASER_RaWaWb); - writeq_relaxed(val, rdist_base + GICR_PENDBASER); - - ctlr = readl_relaxed(rdist_base + GICR_CTLR); - ctlr |= GICR_CTLR_ENABLE_LPIS; - writel_relaxed(ctlr, rdist_base + GICR_CTLR); -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c deleted file mode 100644 index 09f270545646..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c +++ /dev/null @@ -1,248 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c - * over in the kernel tree. 
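
Editor's note (illustrative aside, not part of the patch): the ITS library deleted below packs command fields by masking and shifting with its_mask_encode() and GENMASK_ULL(). Here is a self-contained sketch of that bit-packing in plain C; genmask_ull() and mask_encode() are stand-ins I wrote to mirror the idea, not the kernel helpers, and the field positions (7:0 for the command number, 63:32 for the device ID) are taken from the encoders in this file.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Mask covering bits h..l inclusive, 0 <= l <= h <= 63. */
static uint64_t genmask_ull(unsigned int h, unsigned int l)
{
	uint64_t high = (h == 63) ? ~0ULL : ((1ULL << (h + 1)) - 1);

	return high & ~((1ULL << l) - 1);
}

/* Same shape as its_mask_encode(): place val into bits h..l of *raw. */
static void mask_encode(uint64_t *raw, uint64_t val, unsigned int h, unsigned int l)
{
	uint64_t mask = genmask_ull(h, l);

	*raw &= ~mask;
	*raw |= (val << l) & mask;
}

int main(void)
{
	uint64_t raw_cmd0 = 0;

	mask_encode(&raw_cmd0, 0x08, 7, 0);	/* command number field */
	mask_encode(&raw_cmd0, 0x1234, 63, 32);	/* device ID field */
	printf("raw_cmd[0] = 0x%016" PRIx64 "\n", raw_cmd0);	/* 0x0000123400000008 */
	return 0;
}

(End of editor's note.)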
- */ - -#include -#include -#include -#include - -#include "kvm_util.h" -#include "vgic.h" -#include "gic.h" -#include "gic_v3.h" -#include "processor.h" - -static u64 its_read_u64(unsigned long offset) -{ - return readq_relaxed(GITS_BASE_GVA + offset); -} - -static void its_write_u64(unsigned long offset, u64 val) -{ - writeq_relaxed(val, GITS_BASE_GVA + offset); -} - -static u32 its_read_u32(unsigned long offset) -{ - return readl_relaxed(GITS_BASE_GVA + offset); -} - -static void its_write_u32(unsigned long offset, u32 val) -{ - writel_relaxed(val, GITS_BASE_GVA + offset); -} - -static unsigned long its_find_baser(unsigned int type) -{ - int i; - - for (i = 0; i < GITS_BASER_NR_REGS; i++) { - u64 baser; - unsigned long offset = GITS_BASER + (i * sizeof(baser)); - - baser = its_read_u64(offset); - if (GITS_BASER_TYPE(baser) == type) - return offset; - } - - GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); - return -1; -} - -static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) -{ - unsigned long offset = its_find_baser(type); - u64 baser; - - baser = ((size / SZ_64K) - 1) | - GITS_BASER_PAGE_SIZE_64K | - GITS_BASER_InnerShareable | - base | - GITS_BASER_RaWaWb | - GITS_BASER_VALID; - - its_write_u64(offset, baser); -} - -static void its_install_cmdq(vm_paddr_t base, size_t size) -{ - u64 cbaser; - - cbaser = ((size / SZ_4K) - 1) | - GITS_CBASER_InnerShareable | - base | - GITS_CBASER_RaWaWb | - GITS_CBASER_VALID; - - its_write_u64(GITS_CBASER, cbaser); -} - -void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, - vm_paddr_t device_tbl, size_t device_tbl_sz, - vm_paddr_t cmdq, size_t cmdq_size) -{ - u32 ctlr; - - its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); - its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); - its_install_cmdq(cmdq, cmdq_size); - - ctlr = its_read_u32(GITS_CTLR); - ctlr |= GITS_CTLR_ENABLE; - its_write_u32(GITS_CTLR, ctlr); -} - -struct its_cmd_block { - union { - u64 raw_cmd[4]; - __le64 raw_cmd_le[4]; - }; -}; - -static inline void its_fixup_cmd(struct its_cmd_block *cmd) -{ - /* Let's fixup BE commands */ - cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); - cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); - cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); - cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); -} - -static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) -{ - u64 mask = GENMASK_ULL(h, l); - *raw_cmd &= ~mask; - *raw_cmd |= (val << l) & mask; -} - -static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) -{ - its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); -} - -static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) -{ - its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); -} - -static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) -{ - its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); -} - -static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) -{ - its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); -} - -static void its_encode_size(struct its_cmd_block *cmd, u8 size) -{ - its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); -} - -static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) -{ - its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); -} - -static void its_encode_valid(struct its_cmd_block *cmd, int valid) -{ - its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); -} - -static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) -{ - its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 
51, 16); -} - -static void its_encode_collection(struct its_cmd_block *cmd, u16 col) -{ - its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); -} - -#define GITS_CMDQ_POLL_ITERATIONS 0 - -static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) -{ - u64 cwriter = its_read_u64(GITS_CWRITER); - struct its_cmd_block *dst = cmdq_base + cwriter; - u64 cbaser = its_read_u64(GITS_CBASER); - size_t cmdq_size; - u64 next; - int i; - - cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; - - its_fixup_cmd(cmd); - - WRITE_ONCE(*dst, *cmd); - dsb(ishst); - next = (cwriter + sizeof(*cmd)) % cmdq_size; - its_write_u64(GITS_CWRITER, next); - - /* - * Polling isn't necessary considering KVM's ITS emulation at the time - * of writing this, as the CMDQ is processed synchronously after a write - * to CWRITER. - */ - for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { - __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, - "ITS didn't process command at offset %lu after %d iterations\n", - cwriter, i); - - cpu_relax(); - } -} - -void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, - size_t itt_size, bool valid) -{ - struct its_cmd_block cmd = {}; - - its_encode_cmd(&cmd, GITS_CMD_MAPD); - its_encode_devid(&cmd, device_id); - its_encode_size(&cmd, ilog2(itt_size) - 1); - its_encode_itt(&cmd, itt_base); - its_encode_valid(&cmd, valid); - - its_send_cmd(cmdq_base, &cmd); -} - -void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) -{ - struct its_cmd_block cmd = {}; - - its_encode_cmd(&cmd, GITS_CMD_MAPC); - its_encode_collection(&cmd, collection_id); - its_encode_target(&cmd, vcpu_id); - its_encode_valid(&cmd, valid); - - its_send_cmd(cmdq_base, &cmd); -} - -void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, - u32 collection_id, u32 intid) -{ - struct its_cmd_block cmd = {}; - - its_encode_cmd(&cmd, GITS_CMD_MAPTI); - its_encode_devid(&cmd, device_id); - its_encode_event_id(&cmd, event_id); - its_encode_phys_id(&cmd, intid); - its_encode_collection(&cmd, collection_id); - - its_send_cmd(cmdq_base, &cmd); -} - -void its_send_invall_cmd(void *cmdq_base, u32 collection_id) -{ - struct its_cmd_block cmd = {}; - - its_encode_cmd(&cmd, GITS_CMD_INVALL); - its_encode_collection(&cmd, collection_id); - - its_send_cmd(cmdq_base, &cmd); -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/handlers.S b/tools/testing/selftests/kvm/lib/aarch64/handlers.S deleted file mode 100644 index 0e443eadfac6..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/handlers.S +++ /dev/null @@ -1,126 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -.macro save_registers - add sp, sp, #-16 * 17 - - stp x0, x1, [sp, #16 * 0] - stp x2, x3, [sp, #16 * 1] - stp x4, x5, [sp, #16 * 2] - stp x6, x7, [sp, #16 * 3] - stp x8, x9, [sp, #16 * 4] - stp x10, x11, [sp, #16 * 5] - stp x12, x13, [sp, #16 * 6] - stp x14, x15, [sp, #16 * 7] - stp x16, x17, [sp, #16 * 8] - stp x18, x19, [sp, #16 * 9] - stp x20, x21, [sp, #16 * 10] - stp x22, x23, [sp, #16 * 11] - stp x24, x25, [sp, #16 * 12] - stp x26, x27, [sp, #16 * 13] - stp x28, x29, [sp, #16 * 14] - - /* - * This stores sp_el1 into ex_regs.sp so exception handlers can "look" - * at it. It will _not_ be used to restore the sp on return from the - * exception so handlers can not update it. 
- */ - add x1, sp, #16 * 17 - stp x30, x1, [sp, #16 * 15] /* x30, SP */ - - mrs x1, elr_el1 - mrs x2, spsr_el1 - stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ -.endm - -.macro restore_registers - ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ - msr elr_el1, x1 - msr spsr_el1, x2 - - /* sp is not restored */ - ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ - - ldp x28, x29, [sp, #16 * 14] - ldp x26, x27, [sp, #16 * 13] - ldp x24, x25, [sp, #16 * 12] - ldp x22, x23, [sp, #16 * 11] - ldp x20, x21, [sp, #16 * 10] - ldp x18, x19, [sp, #16 * 9] - ldp x16, x17, [sp, #16 * 8] - ldp x14, x15, [sp, #16 * 7] - ldp x12, x13, [sp, #16 * 6] - ldp x10, x11, [sp, #16 * 5] - ldp x8, x9, [sp, #16 * 4] - ldp x6, x7, [sp, #16 * 3] - ldp x4, x5, [sp, #16 * 2] - ldp x2, x3, [sp, #16 * 1] - ldp x0, x1, [sp, #16 * 0] - - add sp, sp, #16 * 17 - - eret -.endm - -.pushsection ".entry.text", "ax" -.balign 0x800 -.global vectors -vectors: -.popsection - -.set vector, 0 - -/* - * Build an exception handler for vector and append a jump to it into - * vectors (while making sure that it's 0x80 aligned). - */ -.macro HANDLER, label -handler_\label: - save_registers - mov x0, sp - mov x1, #vector - bl route_exception - restore_registers - -.pushsection ".entry.text", "ax" -.balign 0x80 - b handler_\label -.popsection - -.set vector, vector + 1 -.endm - -.macro HANDLER_INVALID -.pushsection ".entry.text", "ax" -.balign 0x80 -/* This will abort so no need to save and restore registers. */ - mov x0, #vector - mov x1, #0 /* ec */ - mov x2, #0 /* valid_ec */ - b kvm_exit_unexpected_exception -.popsection - -.set vector, vector + 1 -.endm - -/* - * Caution: be sure to not add anything between the declaration of vectors - * above and these macro calls that will build the vectors table below it. - */ - HANDLER_INVALID // Synchronous EL1t - HANDLER_INVALID // IRQ EL1t - HANDLER_INVALID // FIQ EL1t - HANDLER_INVALID // Error EL1t - - HANDLER el1h_sync // Synchronous EL1h - HANDLER el1h_irq // IRQ EL1h - HANDLER el1h_fiq // FIQ EL1h - HANDLER el1h_error // Error EL1h - - HANDLER el0_sync_64 // Synchronous 64-bit EL0 - HANDLER el0_irq_64 // IRQ 64-bit EL0 - HANDLER el0_fiq_64 // FIQ 64-bit EL0 - HANDLER el0_error_64 // Error 64-bit EL0 - - HANDLER el0_sync_32 // Synchronous 32-bit EL0 - HANDLER el0_irq_32 // IRQ 32-bit EL0 - HANDLER el0_fiq_32 // FIQ 32-bit EL0 - HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c deleted file mode 100644 index 7ba3aa3755f3..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ /dev/null @@ -1,647 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AArch64 code - * - * Copyright (C) 2018, Red Hat, Inc. 
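
Editor's note (illustrative aside, not part of the patch): the pgd/pud/pmd/pte index helpers deleted below derive each translation level's index purely from page_shift and va_bits shift arithmetic. The sketch that follows shows the same arithmetic standalone; the 4KiB-granule, 4-level, 48-bit-VA figures are chosen for illustration, and level_index() is my own condensed helper, not a function from this file.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* 4KiB pages */
#define VA_BITS		48
#define LEVELS		4

/* level 0 = PGD ... level 3 = PTE; 4KiB pages give 9 index bits per level. */
static uint64_t level_index(uint64_t gva, int level)
{
	unsigned int bits_per_level = PAGE_SHIFT - 3;
	unsigned int shift = (LEVELS - 1 - level) * bits_per_level + PAGE_SHIFT;
	uint64_t mask = (level == 0) ?
			(1ULL << (VA_BITS - shift)) - 1 :
			(1ULL << bits_per_level) - 1;

	return (gva >> shift) & mask;
}

int main(void)
{
	uint64_t gva = 0x0000aabbccdde000ULL;

	for (int level = 0; level < LEVELS; level++)
		printf("level %d index: %llu\n", level,
		       (unsigned long long)level_index(gva, level));
	return 0;
}

(End of editor's note.)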
- */ - -#include -#include - -#include "guest_modes.h" -#include "kvm_util.h" -#include "processor.h" -#include "ucall_common.h" - -#include -#include - -#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 - -static vm_vaddr_t exception_handlers; - -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size) & ~(vm->page_size - 1); -} - -static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) -{ - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; - uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; - - return (gva >> shift) & mask; -} - -static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) -{ - unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; - uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - - TEST_ASSERT(vm->pgtable_levels == 4, - "Mode %d does not have 4 page table levels", vm->mode); - - return (gva >> shift) & mask; -} - -static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) -{ - unsigned int shift = (vm->page_shift - 3) + vm->page_shift; - uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - - TEST_ASSERT(vm->pgtable_levels >= 3, - "Mode %d does not have >= 3 page table levels", vm->mode); - - return (gva >> shift) & mask; -} - -static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) -{ - uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - return (gva >> vm->page_shift) & mask; -} - -static inline bool use_lpa2_pte_format(struct kvm_vm *vm) -{ - return (vm->page_size == SZ_4K || vm->page_size == SZ_16K) && - (vm->pa_bits > 48 || vm->va_bits > 48); -} - -static uint64_t addr_pte(struct kvm_vm *vm, uint64_t pa, uint64_t attrs) -{ - uint64_t pte; - - if (use_lpa2_pte_format(vm)) { - pte = pa & GENMASK(49, vm->page_shift); - pte |= FIELD_GET(GENMASK(51, 50), pa) << 8; - attrs &= ~GENMASK(9, 8); - } else { - pte = pa & GENMASK(47, vm->page_shift); - if (vm->page_shift == 16) - pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; - } - pte |= attrs; - - return pte; -} - -static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) -{ - uint64_t pa; - - if (use_lpa2_pte_format(vm)) { - pa = pte & GENMASK(49, vm->page_shift); - pa |= FIELD_GET(GENMASK(9, 8), pte) << 50; - } else { - pa = pte & GENMASK(47, vm->page_shift); - if (vm->page_shift == 16) - pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; - } - - return pa; -} - -static uint64_t ptrs_per_pgd(struct kvm_vm *vm) -{ - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; - return 1 << (vm->va_bits - shift); -} - -static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) -{ - return 1 << (vm->page_shift - 3); -} - -void virt_arch_pgd_alloc(struct kvm_vm *vm) -{ - size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; - - if (vm->pgd_created) - return; - - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; -} - -static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint64_t flags) -{ - uint8_t attr_idx = flags & 7; - uint64_t *ptep; - - TEST_ASSERT((vaddr % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); - TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, - (vaddr >> vm->page_shift)), - "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); - TEST_ASSERT((paddr 
>> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; - if (!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); - - switch (vm->pgtable_levels) { - case 4: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; - if (!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); - /* fall through */ - case 3: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; - if (!*ptep) - *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); - /* fall through */ - case 2: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; - break; - default: - TEST_FAIL("Page table levels must be 2, 3, or 4"); - } - - *ptep = addr_pte(vm, paddr, (attr_idx << 2) | (1 << 10) | 3); /* AF */ -} - -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) -{ - uint64_t attr_idx = MT_NORMAL; - - _virt_pg_map(vm, vaddr, paddr, attr_idx); -} - -uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) -{ - uint64_t *ptep; - - if (!vm->pgd_created) - goto unmapped_gva; - - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; - if (!ptep) - goto unmapped_gva; - - switch (vm->pgtable_levels) { - case 4: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; - if (!ptep) - goto unmapped_gva; - /* fall through */ - case 3: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; - if (!ptep) - goto unmapped_gva; - /* fall through */ - case 2: - ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; - if (!ptep) - goto unmapped_gva; - break; - default: - TEST_FAIL("Page table levels must be 2, 3, or 4"); - } - - return ptep; - -unmapped_gva: - TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); - exit(EXIT_FAILURE); -} - -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - uint64_t *ptep = virt_get_pte_hva(vm, gva); - - return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); -} - -static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) -{ -#ifdef DEBUG - static const char * const type[] = { "", "pud", "pmd", "pte" }; - uint64_t pte, *ptep; - - if (level == 4) - return; - - for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { - ptep = addr_gpa2hva(vm, pte); - if (!*ptep) - continue; - fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); - pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); - } -#endif -} - -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - int level = 4 - (vm->pgtable_levels - 1); - uint64_t pgd, *ptep; - - if (!vm->pgd_created) - return; - - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { - ptep = addr_gpa2hva(vm, pgd); - if (!*ptep) - continue; - fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); - pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); - } -} - -void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) -{ - struct kvm_vcpu_init default_init = { .target = -1, }; - struct kvm_vm *vm = vcpu->vm; - uint64_t sctlr_el1, tcr_el1, ttbr0_el1; - - if (!init) - init = &default_init; - - if (init->target == -1) { - struct kvm_vcpu_init preferred; - vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred); - init->target = preferred.target; - } - - 
vcpu_ioctl(vcpu, KVM_ARM_VCPU_INIT, init); - - /* - * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 - * registers, which the variable argument list macros do. - */ - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - - sctlr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1)); - tcr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1)); - - /* Configure base granule size */ - switch (vm->mode) { - case VM_MODE_PXXV48_4K: - TEST_FAIL("AArch64 does not support 4K sized pages " - "with ANY-bit physical address ranges"); - case VM_MODE_P52V48_64K: - case VM_MODE_P48V48_64K: - case VM_MODE_P40V48_64K: - case VM_MODE_P36V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ - break; - case VM_MODE_P52V48_16K: - case VM_MODE_P48V48_16K: - case VM_MODE_P40V48_16K: - case VM_MODE_P36V48_16K: - case VM_MODE_P36V47_16K: - tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ - break; - case VM_MODE_P52V48_4K: - case VM_MODE_P48V48_4K: - case VM_MODE_P40V48_4K: - case VM_MODE_P36V48_4K: - tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ - break; - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); - } - - ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); - - /* Configure output size */ - switch (vm->mode) { - case VM_MODE_P52V48_4K: - case VM_MODE_P52V48_16K: - case VM_MODE_P52V48_64K: - tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ - ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; - break; - case VM_MODE_P48V48_4K: - case VM_MODE_P48V48_16K: - case VM_MODE_P48V48_64K: - tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ - break; - case VM_MODE_P40V48_4K: - case VM_MODE_P40V48_16K: - case VM_MODE_P40V48_64K: - tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ - break; - case VM_MODE_P36V48_4K: - case VM_MODE_P36V48_16K: - case VM_MODE_P36V48_64K: - case VM_MODE_P36V47_16K: - tcr_el1 |= 1ul << 32; /* IPS = 36 bits */ - break; - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); - } - - sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; - /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; - tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); - tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; - if (use_lpa2_pte_format(vm)) - tcr_el1 |= (1ul << 59) /* DS */; - - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), ttbr0_el1); - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpu->id); -} - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) -{ - uint64_t pstate, pc; - - pstate = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate)); - pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); - - fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", - indent, "", pstate, pc); -} - -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) -{ - vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); -} - -static struct kvm_vcpu *__aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_vcpu_init *init) -{ - size_t stack_size; - uint64_t stack_vaddr; - struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - - stack_size = vm->page_size == 4096 ? 
DEFAULT_STACK_PGS * vm->page_size : - vm->page_size; - stack_vaddr = __vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, - MEM_REGION_DATA); - - aarch64_vcpu_setup(vcpu, init); - - vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - return vcpu; -} - -struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_vcpu_init *init, void *guest_code) -{ - struct kvm_vcpu *vcpu = __aarch64_vcpu_add(vm, vcpu_id, init); - - vcpu_arch_set_entry_point(vcpu, guest_code); - - return vcpu; -} - -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) -{ - return __aarch64_vcpu_add(vm, vcpu_id, NULL); -} - -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) -{ - va_list ap; - int i; - - TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" - " num: %u", num); - - va_start(ap, num); - - for (i = 0; i < num; i++) { - vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.regs[i]), - va_arg(ap, uint64_t)); - } - - va_end(ap); -} - -void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) -{ - ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); - while (1) - ; -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) != UCALL_UNHANDLED) - return; - - if (uc.args[2]) /* valid_ec */ { - assert(VECTOR_IS_SYNC(uc.args[0])); - TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", - uc.args[0], uc.args[1]); - } else { - assert(!VECTOR_IS_SYNC(uc.args[0])); - TEST_FAIL("Unexpected exception (vector:0x%lx)", - uc.args[0]); - } -} - -struct handlers { - handler_fn exception_handlers[VECTOR_NUM][ESR_ELx_EC_MAX + 1]; -}; - -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) -{ - extern char vectors; - - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); -} - -void route_exception(struct ex_regs *regs, int vector) -{ - struct handlers *handlers = (struct handlers *)exception_handlers; - bool valid_ec; - int ec = 0; - - switch (vector) { - case VECTOR_SYNC_CURRENT: - case VECTOR_SYNC_LOWER_64: - ec = ESR_ELx_EC(read_sysreg(esr_el1)); - valid_ec = true; - break; - case VECTOR_IRQ_CURRENT: - case VECTOR_IRQ_LOWER_64: - case VECTOR_FIQ_CURRENT: - case VECTOR_FIQ_LOWER_64: - case VECTOR_ERROR_CURRENT: - case VECTOR_ERROR_LOWER_64: - ec = 0; - valid_ec = false; - break; - default: - valid_ec = false; - goto unexpected_exception; - } - - if (handlers && handlers->exception_handlers[vector][ec]) - return handlers->exception_handlers[vector][ec](regs); - -unexpected_exception: - kvm_exit_unexpected_exception(vector, ec, valid_ec); -} - -void vm_init_descriptor_tables(struct kvm_vm *vm) -{ - vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), - vm->page_size, MEM_REGION_DATA); - - *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; -} - -void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, - void (*handler)(struct ex_regs *)) -{ - struct handlers *handlers = addr_gva2hva(vm, vm->handlers); - - assert(VECTOR_IS_SYNC(vector)); - assert(vector < VECTOR_NUM); - assert(ec <= ESR_ELx_EC_MAX); - handlers->exception_handlers[vector][ec] = handler; -} - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) -{ - struct handlers *handlers = addr_gva2hva(vm, vm->handlers); - - assert(!VECTOR_IS_SYNC(vector)); - assert(vector < VECTOR_NUM); - handlers->exception_handlers[vector][0] = handler; -} - -uint32_t guest_get_vcpuid(void) -{ - return 
read_sysreg(tpidr_el1); -} - -static uint32_t max_ipa_for_page_size(uint32_t vm_ipa, uint32_t gran, - uint32_t not_sup_val, uint32_t ipa52_min_val) -{ - if (gran == not_sup_val) - return 0; - else if (gran >= ipa52_min_val && vm_ipa >= 52) - return 52; - else - return min(vm_ipa, 48U); -} - -void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, - uint32_t *ipa16k, uint32_t *ipa64k) -{ - struct kvm_vcpu_init preferred_init; - int kvm_fd, vm_fd, vcpu_fd, err; - uint64_t val; - uint32_t gran; - struct kvm_one_reg reg = { - .id = KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1), - .addr = (uint64_t)&val, - }; - - kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, (void *)(unsigned long)ipa); - TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); - - vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); - TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); - - err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init); - TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_PREFERRED_TARGET, err)); - err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init); - TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_VCPU_INIT, err)); - - err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); - TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); - - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); - *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, - ID_AA64MMFR0_EL1_TGRAN4_52_BIT); - - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); - *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, - ID_AA64MMFR0_EL1_TGRAN64_IMP); - - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); - *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, - ID_AA64MMFR0_EL1_TGRAN16_52_BIT); - - close(vcpu_fd); - close(vm_fd); - close(kvm_fd); -} - -#define __smccc_call(insn, function_id, arg0, arg1, arg2, arg3, arg4, arg5, \ - arg6, res) \ - asm volatile("mov w0, %w[function_id]\n" \ - "mov x1, %[arg0]\n" \ - "mov x2, %[arg1]\n" \ - "mov x3, %[arg2]\n" \ - "mov x4, %[arg3]\n" \ - "mov x5, %[arg4]\n" \ - "mov x6, %[arg5]\n" \ - "mov x7, %[arg6]\n" \ - #insn "#0\n" \ - "mov %[res0], x0\n" \ - "mov %[res1], x1\n" \ - "mov %[res2], x2\n" \ - "mov %[res3], x3\n" \ - : [res0] "=r"(res->a0), [res1] "=r"(res->a1), \ - [res2] "=r"(res->a2), [res3] "=r"(res->a3) \ - : [function_id] "r"(function_id), [arg0] "r"(arg0), \ - [arg1] "r"(arg1), [arg2] "r"(arg2), [arg3] "r"(arg3), \ - [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6) \ - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7") - - -void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, - uint64_t arg6, struct arm_smccc_res *res) -{ - __smccc_call(hvc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, - arg6, res); -} - -void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, - uint64_t arg6, struct arm_smccc_res *res) -{ - __smccc_call(smc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, - arg6, res); -} - -void kvm_selftest_arch_init(void) -{ - /* - * arm64 doesn't have a true default mode, so start by computing the - * available IPA space and page sizes early. - */ - guest_modes_append_default(); -} - -void vm_vaddr_populate_bitmap(struct kvm_vm *vm) -{ - /* - * arm64 selftests use only TTBR0_EL1, meaning that the valid VA space - * is [0, 2^(64 - TCR_EL1.T0SZ)). 
- */ - sparsebit_set_num(vm->vpages_valid, 0, - (1ULL << vm->va_bits) >> vm->page_shift); -} - -/* Helper to call wfi instruction. */ -void wfi(void) -{ - asm volatile("wfi"); -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/spinlock.c b/tools/testing/selftests/kvm/lib/aarch64/spinlock.c deleted file mode 100644 index a076e780be5d..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/spinlock.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ARM64 Spinlock support - */ -#include - -#include "spinlock.h" - -void spin_lock(struct spinlock *lock) -{ - int val, res; - - asm volatile( - "1: ldaxr %w0, [%2]\n" - " cbnz %w0, 1b\n" - " mov %w0, #1\n" - " stxr %w1, %w0, [%2]\n" - " cbnz %w1, 1b\n" - : "=&r" (val), "=&r" (res) - : "r" (&lock->v) - : "memory"); -} - -void spin_unlock(struct spinlock *lock) -{ - asm volatile("stlr wzr, [%0]\n" : : "r" (&lock->v) : "memory"); -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c deleted file mode 100644 index ddab0ce89d4d..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ucall support. A ucall is a "hypercall to userspace". - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#include "kvm_util.h" - -vm_vaddr_t *ucall_exit_mmio_addr; - -void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) -{ - vm_vaddr_t mmio_gva = vm_vaddr_unused_gap(vm, vm->page_size, KVM_UTIL_MIN_VADDR); - - virt_map(vm, mmio_gva, mmio_gpa, 1); - - vm->ucall_mmio_addr = mmio_gpa; - - write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gva); -} - -void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - if (run->exit_reason == KVM_EXIT_MMIO && - run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) { - TEST_ASSERT(run->mmio.is_write && run->mmio.len == sizeof(uint64_t), - "Unexpected ucall exit mmio address access"); - return (void *)(*((uint64_t *)run->mmio.data)); - } - - return NULL; -} diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c deleted file mode 100644 index 4427f43f73ea..000000000000 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ARM Generic Interrupt Controller (GIC) v3 host support - */ - -#include -#include -#include -#include -#include -#include - -#include "kvm_util.h" -#include "vgic.h" -#include "gic.h" -#include "gic_v3.h" - -/* - * vGIC-v3 default host setup - * - * Input args: - * vm - KVM VM - * nr_vcpus - Number of vCPUs supported by this VM - * - * Output args: None - * - * Return: GIC file-descriptor or negative error code upon failure - * - * The function creates a vGIC-v3 device and maps the distributor and - * redistributor regions of the guest. Since it depends on the number of - * vCPUs for the VM, it must be called after all the vCPUs have been created. - */ -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) -{ - int gic_fd; - uint64_t attr; - struct list_head *iter; - unsigned int nr_gic_pages, nr_vcpus_created = 0; - - TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty"); - - /* - * Make sure that the caller is infact calling this - * function after all the vCPUs are added. 
- */ - list_for_each(iter, &vm->vcpus) - nr_vcpus_created++; - TEST_ASSERT(nr_vcpus == nr_vcpus_created, - "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)", - nr_vcpus, nr_vcpus_created); - - /* Distributor setup */ - gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3); - if (gic_fd < 0) - return gic_fd; - - kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs); - - kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - attr = GICD_BASE_GPA; - kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); - nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); - virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); - - /* Redistributor setup */ - attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); - kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); - nr_gic_pages = vm_calc_num_guest_pages(vm->mode, - KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); - virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); - - kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - return gic_fd; -} - -/* should only work for level sensitive interrupts */ -int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) -{ - uint64_t attr = 32 * (intid / 32); - uint64_t index = intid % 32; - uint64_t val; - int ret; - - ret = __kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, - attr, &val); - if (ret != 0) - return ret; - - val |= 1U << index; - ret = __kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, - attr, &val); - return ret; -} - -void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) -{ - int ret = _kvm_irq_set_level_info(gic_fd, intid, level); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, ret)); -} - -int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) -{ - uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK; - - TEST_ASSERT(!INTID_IS_SGI(intid), "KVM_IRQ_LINE's interface itself " - "doesn't allow injecting SGIs. There's no mask for it."); - - if (INTID_IS_PPI(intid)) - irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; - else - irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT; - - return _kvm_irq_line(vm, irq, level); -} - -void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) -{ - int ret = _kvm_arm_irq_line(vm, intid, level); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); -} - -static void vgic_poke_irq(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu, - uint64_t reg_off) -{ - uint64_t reg = intid / 32; - uint64_t index = intid % 32; - uint64_t attr = reg_off + reg * 4; - uint64_t val; - bool intid_is_private = INTID_IS_SGI(intid) || INTID_IS_PPI(intid); - - uint32_t group = intid_is_private ? KVM_DEV_ARM_VGIC_GRP_REDIST_REGS - : KVM_DEV_ARM_VGIC_GRP_DIST_REGS; - - if (intid_is_private) { - /* TODO: only vcpu 0 implemented for now. */ - assert(vcpu->id == 0); - attr += SZ_64K; - } - - /* Check that the addr part of the attr is within 32 bits. */ - assert((attr & ~KVM_DEV_ARM_VGIC_OFFSET_MASK) == 0); - - /* - * All calls will succeed, even with invalid intid's, as long as the - * addr part of the attr is within 32 bits (checked above). An invalid - * intid will just make the read/writes point to above the intended - * register space (i.e., ICPENDR after ISPENDR). 
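
Editor's note (illustrative aside, not part of the patch): vgic_poke_irq() above turns an intid into a 32-bit register offset and a bit position with intid / 32 and intid % 32. A worked example of that arithmetic follows; the 0x0200 base is GICD_ISPENDR's location in the GICv3 distributor map as I recall it from the architecture spec, quoted for illustration only.

#include <stdint.h>
#include <stdio.h>

#define GICD_ISPENDR	0x0200	/* set-pending registers, one bit per intid */

int main(void)
{
	uint32_t intid = 100;
	uint64_t reg   = intid / 32;			/* 32 interrupts per register */
	uint64_t index = intid % 32;			/* bit within that register */
	uint64_t attr  = GICD_ISPENDR + reg * 4;	/* byte offset of the register */

	printf("intid %u -> attr 0x%llx, bit %llu\n", intid,
	       (unsigned long long)attr, (unsigned long long)index);
	/* prints: intid 100 -> attr 0x20c, bit 4 */
	return 0;
}

(End of editor's note.)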
- */ - kvm_device_attr_get(gic_fd, group, attr, &val); - val |= 1ULL << index; - kvm_device_attr_set(gic_fd, group, attr, &val); -} - -void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) -{ - vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR); -} - -void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) -{ - vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); -} - -int vgic_its_setup(struct kvm_vm *vm) -{ - int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); - u64 attr; - - attr = GITS_BASE_GPA; - kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &attr); - - kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - - virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, - vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); - - return its_fd; -} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c new file mode 100644 index 000000000000..7abbf8866512 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) support + */ + +#include +#include +#include + +#include "kvm_util.h" + +#include +#include "gic_private.h" +#include "processor.h" +#include "spinlock.h" + +static const struct gic_common_ops *gic_common_ops; +static struct spinlock gic_lock; + +static void gic_cpu_init(unsigned int cpu) +{ + gic_common_ops->gic_cpu_init(cpu); +} + +static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) +{ + const struct gic_common_ops *gic_ops = NULL; + + spin_lock(&gic_lock); + + /* Distributor initialization is needed only once per VM */ + if (gic_common_ops) { + spin_unlock(&gic_lock); + return; + } + + if (type == GIC_V3) + gic_ops = &gicv3_ops; + + GUEST_ASSERT(gic_ops); + + gic_ops->gic_init(nr_cpus); + gic_common_ops = gic_ops; + + /* Make sure that the initialized data is visible to all the vCPUs */ + dsb(sy); + + spin_unlock(&gic_lock); +} + +void gic_init(enum gic_type type, unsigned int nr_cpus) +{ + uint32_t cpu = guest_get_vcpuid(); + + GUEST_ASSERT(type < GIC_TYPE_MAX); + GUEST_ASSERT(nr_cpus); + + gic_dist_init(type, nr_cpus); + gic_cpu_init(cpu); +} + +void gic_irq_enable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_enable(intid); +} + +void gic_irq_disable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_disable(intid); +} + +unsigned int gic_get_and_ack_irq(void) +{ + uint64_t irqstat; + unsigned int intid; + + GUEST_ASSERT(gic_common_ops); + + irqstat = gic_common_ops->gic_read_iar(); + intid = irqstat & GENMASK(23, 0); + + return intid; +} + +void gic_set_eoi(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_eoir(intid); +} + +void gic_set_dir(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_dir(intid); +} + +void gic_set_eoi_split(bool split) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_eoi_split(split); +} + +void gic_set_priority_mask(uint64_t pmr) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_priority_mask(pmr); +} + +void gic_set_priority(unsigned int intid, unsigned int prio) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_priority(intid, prio); +} + +void gic_irq_set_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_active(intid); +} + +void gic_irq_clear_active(unsigned 
int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_active(intid); +} + +bool gic_irq_get_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_active(intid); +} + +void gic_irq_set_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_pending(intid); +} + +void gic_irq_clear_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_pending(intid); +} + +bool gic_irq_get_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_pending(intid); +} + +void gic_irq_set_config(unsigned int intid, bool is_edge) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_config(intid, is_edge); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h new file mode 100644 index 000000000000..d24e9ecc96c6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/gic_private.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) private defines that's only + * shared among the GIC library code. + */ + +#ifndef SELFTEST_KVM_GIC_PRIVATE_H +#define SELFTEST_KVM_GIC_PRIVATE_H + +struct gic_common_ops { + void (*gic_init)(unsigned int nr_cpus); + void (*gic_cpu_init)(unsigned int cpu); + void (*gic_irq_enable)(unsigned int intid); + void (*gic_irq_disable)(unsigned int intid); + uint64_t (*gic_read_iar)(void); + void (*gic_write_eoir)(uint32_t irq); + void (*gic_write_dir)(uint32_t irq); + void (*gic_set_eoi_split)(bool split); + void (*gic_set_priority_mask)(uint64_t mask); + void (*gic_set_priority)(uint32_t intid, uint32_t prio); + void (*gic_irq_set_active)(uint32_t intid); + void (*gic_irq_clear_active)(uint32_t intid); + bool (*gic_irq_get_active)(uint32_t intid); + void (*gic_irq_set_pending)(uint32_t intid); + void (*gic_irq_clear_pending)(uint32_t intid); + bool (*gic_irq_get_pending)(uint32_t intid); + void (*gic_irq_set_config)(uint32_t intid, bool is_edge); +}; + +extern const struct gic_common_ops gicv3_ops; + +#endif /* SELFTEST_KVM_GIC_PRIVATE_H */ diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c new file mode 100644 index 000000000000..66d05506f78b --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 support + */ + +#include + +#include "kvm_util.h" +#include "processor.h" +#include "delay.h" + +#include "gic.h" +#include "gic_v3.h" +#include "gic_private.h" + +#define GICV3_MAX_CPUS 512 + +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + +#define ICC_PMR_DEF_PRIO 0xf0 + +struct gicv3_data { + unsigned int nr_cpus; + unsigned int nr_spis; +}; + +#define sgi_base_from_redist(redist_base) (redist_base + SZ_64K) +#define DIST_BIT (1U << 31) + +enum gicv3_intid_range { + SGI_RANGE, + PPI_RANGE, + SPI_RANGE, + INVALID_RANGE, +}; + +static struct gicv3_data gicv3_data; + +static void gicv3_gicd_wait_for_rwp(void) +{ + unsigned int count = 100000; /* 1s */ + + while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static inline volatile void *gicr_base_cpu(uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return GICR_BASE_GVA 
+ cpu * SZ_64K * 2; +} + +static void gicv3_gicr_wait_for_rwp(uint32_t cpu) +{ + unsigned int count = 100000; /* 1s */ + + while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) +{ + if (cpu_or_dist & DIST_BIT) + gicv3_gicd_wait_for_rwp(); + else + gicv3_gicr_wait_for_rwp(cpu_or_dist); +} + +static enum gicv3_intid_range get_intid_range(unsigned int intid) +{ + switch (intid) { + case 0 ... 15: + return SGI_RANGE; + case 16 ... 31: + return PPI_RANGE; + case 32 ... 1019: + return SPI_RANGE; + } + + /* We should not be reaching here */ + GUEST_ASSERT(0); + + return INVALID_RANGE; +} + +static uint64_t gicv3_read_iar(void) +{ + uint64_t irqstat = read_sysreg_s(SYS_ICC_IAR1_EL1); + + dsb(sy); + return irqstat; +} + +static void gicv3_write_eoir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_EOIR1_EL1); + isb(); +} + +static void gicv3_write_dir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_DIR_EL1); + isb(); +} + +static void gicv3_set_priority_mask(uint64_t mask) +{ + write_sysreg_s(mask, SYS_ICC_PMR_EL1); +} + +static void gicv3_set_eoi_split(bool split) +{ + uint32_t val; + + /* + * All other fields are read-only, so no need to read CTLR first. In + * fact, the kernel does the same. + */ + val = split ? (1U << 1) : 0; + write_sysreg_s(val, SYS_ICC_CTLR_EL1); + isb(); +} + +uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) +{ + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); + return readl(base + offset); +} + +void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) +{ + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); + writel(reg_val, base + offset); +} + +uint32_t gicv3_getl_fields(uint32_t cpu_or_dist, uint64_t offset, uint32_t mask) +{ + return gicv3_reg_readl(cpu_or_dist, offset) & mask; +} + +void gicv3_setl_fields(uint32_t cpu_or_dist, uint64_t offset, + uint32_t mask, uint32_t reg_val) +{ + uint32_t tmp = gicv3_reg_readl(cpu_or_dist, offset) & ~mask; + + tmp |= (reg_val & mask); + gicv3_reg_writel(cpu_or_dist, offset, tmp); +} + +/* + * We use a single offset for the distributor and redistributor maps as they + * have the same value in both. The only exceptions are registers that only + * exist in one and not the other, like GICR_WAKER that doesn't exist in the + * distributor map. Such registers are conveniently marked as reserved in the + * map that doesn't implement it; like GICR_WAKER's offset of 0x0014 being + * marked as "Reserved" in the Distributor map. + */ +static void gicv3_access_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, + bool write, uint32_t *val) +{ + uint32_t cpu = guest_get_vcpuid(); + enum gicv3_intid_range intid_range = get_intid_range(intid); + uint32_t fields_per_reg, index, mask, shift; + uint32_t cpu_or_dist; + + GUEST_ASSERT(bits_per_field <= reg_bits); + GUEST_ASSERT(!write || *val < (1U << bits_per_field)); + /* + * This function does not support 64 bit accesses. Just asserting here + * until we implement readq/writeq. + */ + GUEST_ASSERT(reg_bits == 32); + + fields_per_reg = reg_bits / bits_per_field; + index = intid % fields_per_reg; + shift = index * bits_per_field; + mask = ((1U << bits_per_field) - 1) << shift; + + /* Set offset to the actual register holding intid's config. 
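
Editor's note (illustrative aside, not part of the patch): gicv3_access_reg() above computes which register and which bit lane hold an interrupt's configuration. The snippet below works that arithmetic through numerically for an 8-bit priority field in the 32-bit GICD_IPRIORITYR array; intid 42 is picked arbitrarily, and the code only mirrors the computation above rather than touching any hardware.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t intid = 42, reg_bits = 32, bits_per_field = 8;
	uint32_t fields_per_reg = reg_bits / bits_per_field;		/* 4 */
	uint32_t index = intid % fields_per_reg;			/* 2 */
	uint32_t shift = index * bits_per_field;			/* 16 */
	uint32_t mask  = ((1U << bits_per_field) - 1) << shift;	/* 0x00ff0000 */
	uint32_t offset = (intid / fields_per_reg) * (reg_bits / 8);	/* +40 bytes */

	printf("intid %u: GICD_IPRIORITYR byte offset +%u, shift %u, mask 0x%08x\n",
	       intid, offset, shift, mask);
	return 0;
}

(End of editor's note.)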
*/ + offset += (intid / fields_per_reg) * (reg_bits / 8); + + cpu_or_dist = (intid_range == SPI_RANGE) ? DIST_BIT : cpu; + + if (write) + gicv3_setl_fields(cpu_or_dist, offset, mask, *val << shift); + *val = gicv3_getl_fields(cpu_or_dist, offset, mask) >> shift; +} + +static void gicv3_write_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, uint32_t val) +{ + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, true, &val); +} + +static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field) +{ + uint32_t val; + + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, false, &val); + return val; +} + +static void gicv3_set_priority(uint32_t intid, uint32_t prio) +{ + gicv3_write_reg(intid, GICD_IPRIORITYR, 32, 8, prio); +} + +/* Sets the intid to be level-sensitive or edge-triggered. */ +static void gicv3_irq_set_config(uint32_t intid, bool is_edge) +{ + uint32_t val; + + /* N/A for private interrupts. */ + GUEST_ASSERT(get_intid_range(intid) == SPI_RANGE); + val = is_edge ? 2 : 0; + gicv3_write_reg(intid, GICD_ICFGR, 32, 2, val); +} + +static void gicv3_irq_enable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); +} + +static void gicv3_irq_disable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); +} + +static void gicv3_irq_set_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISACTIVER, 32, 1, 1); +} + +static void gicv3_irq_clear_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICACTIVER, 32, 1, 1); +} + +static bool gicv3_irq_get_active(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISACTIVER, 32, 1); +} + +static void gicv3_irq_set_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISPENDR, 32, 1, 1); +} + +static void gicv3_irq_clear_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICPENDR, 32, 1, 1); +} + +static bool gicv3_irq_get_pending(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); +} + +static void gicv3_enable_redist(volatile void *redist_base) +{ + uint32_t val = readl(redist_base + GICR_WAKER); + unsigned int count = 100000; /* 1s */ + + val &= ~GICR_WAKER_ProcessorSleep; + writel(val, redist_base + GICR_WAKER); + + /* Wait until the processor is 'active' */ + while (readl(redist_base + GICR_WAKER) & GICR_WAKER_ChildrenAsleep) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_cpu_init(unsigned int cpu) +{ + volatile void *sgi_base; + unsigned int i; + volatile void *redist_base_cpu; + + GUEST_ASSERT(cpu < gicv3_data.nr_cpus); + + redist_base_cpu = gicr_base_cpu(cpu); + sgi_base = sgi_base_from_redist(redist_base_cpu); + + gicv3_enable_redist(redist_base_cpu); + + /* + * Mark all the SGI and PPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. 
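+	 * SGIs (INTIDs 0-15) and PPIs (16-31) are private, per-CPU interrupts,
+	 * so they are programmed through this redistributor's SGI_base frame
+	 * rather than through the distributor.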
+ */ + writel(~0, sgi_base + GICR_IGROUPR0); + writel(~0, sgi_base + GICR_ICACTIVER0); + writel(~0, sgi_base + GICR_ICENABLER0); + + /* Set a default priority for all the SGIs and PPIs */ + for (i = 0; i < 32; i += 4) + writel(GICD_INT_DEF_PRI_X4, + sgi_base + GICR_IPRIORITYR0 + i); + + gicv3_gicr_wait_for_rwp(cpu); + + /* Enable the GIC system register (ICC_*) access */ + write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, + SYS_ICC_SRE_EL1); + + /* Set a default priority threshold */ + write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + + /* Enable non-secure Group-1 interrupts */ + write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); +} + +static void gicv3_dist_init(void) +{ + unsigned int i; + + /* Disable the distributor until we set things up */ + writel(0, GICD_BASE_GVA + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); + + /* + * Mark all the SPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. + */ + for (i = 32; i < gicv3_data.nr_spis; i += 32) { + writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); + } + + /* Set a default priority for all the SPIs */ + for (i = 32; i < gicv3_data.nr_spis; i += 4) + writel(GICD_INT_DEF_PRI_X4, + GICD_BASE_GVA + GICD_IPRIORITYR + i); + + /* Wait for the settings to sync-in */ + gicv3_gicd_wait_for_rwp(); + + /* Finally, enable the distributor globally with ARE */ + writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | + GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); +} + +static void gicv3_init(unsigned int nr_cpus) +{ + GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); + + gicv3_data.nr_cpus = nr_cpus; + gicv3_data.nr_spis = GICD_TYPER_SPIS( + readl(GICD_BASE_GVA + GICD_TYPER)); + if (gicv3_data.nr_spis > 1020) + gicv3_data.nr_spis = 1020; + + /* + * Initialize only the distributor for now. + * The redistributor and CPU interfaces are initialized + * later for every PE. 
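+	 * Each PE brings up its own redistributor and CPU interface by calling
+	 * gicv3_cpu_init() before it starts taking interrupts.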
+ */ + gicv3_dist_init(); +} + +const struct gic_common_ops gicv3_ops = { + .gic_init = gicv3_init, + .gic_cpu_init = gicv3_cpu_init, + .gic_irq_enable = gicv3_irq_enable, + .gic_irq_disable = gicv3_irq_disable, + .gic_read_iar = gicv3_read_iar, + .gic_write_eoir = gicv3_write_eoir, + .gic_write_dir = gicv3_write_dir, + .gic_set_priority_mask = gicv3_set_priority_mask, + .gic_set_eoi_split = gicv3_set_eoi_split, + .gic_set_priority = gicv3_set_priority, + .gic_irq_set_active = gicv3_irq_set_active, + .gic_irq_clear_active = gicv3_irq_clear_active, + .gic_irq_get_active = gicv3_irq_get_active, + .gic_irq_set_pending = gicv3_irq_set_pending, + .gic_irq_clear_pending = gicv3_irq_clear_pending, + .gic_irq_get_pending = gicv3_irq_get_pending, + .gic_irq_set_config = gicv3_irq_set_config, +}; + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table) +{ + volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); + + u32 ctlr; + u64 val; + + val = (cfg_table | + GICR_PROPBASER_InnerShareable | + GICR_PROPBASER_RaWaWb | + ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); + writeq_relaxed(val, rdist_base + GICR_PROPBASER); + + val = (pend_table | + GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_RaWaWb); + writeq_relaxed(val, rdist_base + GICR_PENDBASER); + + ctlr = readl_relaxed(rdist_base + GICR_CTLR); + ctlr |= GICR_CTLR_ENABLE_LPIS; + writel_relaxed(ctlr, rdist_base + GICR_CTLR); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c new file mode 100644 index 000000000000..09f270545646 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c + * over in the kernel tree. 
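+ * It carries just enough of the command queue handling (MAPD, MAPC, MAPTI
+ * and INVALL) for the selftests to map device/event pairs to LPIs.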
+ */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" +#include "processor.h" + +static u64 its_read_u64(unsigned long offset) +{ + return readq_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u64(unsigned long offset, u64 val) +{ + writeq_relaxed(val, GITS_BASE_GVA + offset); +} + +static u32 its_read_u32(unsigned long offset) +{ + return readl_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u32(unsigned long offset, u32 val) +{ + writel_relaxed(val, GITS_BASE_GVA + offset); +} + +static unsigned long its_find_baser(unsigned int type) +{ + int i; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) { + u64 baser; + unsigned long offset = GITS_BASER + (i * sizeof(baser)); + + baser = its_read_u64(offset); + if (GITS_BASER_TYPE(baser) == type) + return offset; + } + + GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); + return -1; +} + +static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) +{ + unsigned long offset = its_find_baser(type); + u64 baser; + + baser = ((size / SZ_64K) - 1) | + GITS_BASER_PAGE_SIZE_64K | + GITS_BASER_InnerShareable | + base | + GITS_BASER_RaWaWb | + GITS_BASER_VALID; + + its_write_u64(offset, baser); +} + +static void its_install_cmdq(vm_paddr_t base, size_t size) +{ + u64 cbaser; + + cbaser = ((size / SZ_4K) - 1) | + GITS_CBASER_InnerShareable | + base | + GITS_CBASER_RaWaWb | + GITS_CBASER_VALID; + + its_write_u64(GITS_CBASER, cbaser); +} + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size) +{ + u32 ctlr; + + its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); + its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); + its_install_cmdq(cmdq, cmdq_size); + + ctlr = its_read_u32(GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + its_write_u32(GITS_CTLR, ctlr); +} + +struct its_cmd_block { + union { + u64 raw_cmd[4]; + __le64 raw_cmd_le[4]; + }; +}; + +static inline void its_fixup_cmd(struct its_cmd_block *cmd) +{ + /* Let's fixup BE commands */ + cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); + cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); + cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); + cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); +} + +static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) +{ + u64 mask = GENMASK_ULL(h, l); + *raw_cmd &= ~mask; + *raw_cmd |= (val << l) & mask; +} + +static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) +{ + its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); +} + +static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) +{ + its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); +} + +static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) +{ + its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); +} + +static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) +{ + its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); +} + +static void its_encode_size(struct its_cmd_block *cmd, u8 size) +{ + its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); +} + +static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); +} + +static void its_encode_valid(struct its_cmd_block *cmd, int valid) +{ + its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); +} + +static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 
51, 16); +} + +static void its_encode_collection(struct its_cmd_block *cmd, u16 col) +{ + its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); +} + +#define GITS_CMDQ_POLL_ITERATIONS 0 + +static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) +{ + u64 cwriter = its_read_u64(GITS_CWRITER); + struct its_cmd_block *dst = cmdq_base + cwriter; + u64 cbaser = its_read_u64(GITS_CBASER); + size_t cmdq_size; + u64 next; + int i; + + cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; + + its_fixup_cmd(cmd); + + WRITE_ONCE(*dst, *cmd); + dsb(ishst); + next = (cwriter + sizeof(*cmd)) % cmdq_size; + its_write_u64(GITS_CWRITER, next); + + /* + * Polling isn't necessary considering KVM's ITS emulation at the time + * of writing this, as the CMDQ is processed synchronously after a write + * to CWRITER. + */ + for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { + __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, + "ITS didn't process command at offset %lu after %d iterations\n", + cwriter, i); + + cpu_relax(); + } +} + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPD); + its_encode_devid(&cmd, device_id); + its_encode_size(&cmd, ilog2(itt_size) - 1); + its_encode_itt(&cmd, itt_base); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPC); + its_encode_collection(&cmd, collection_id); + its_encode_target(&cmd, vcpu_id); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPTI); + its_encode_devid(&cmd, device_id); + its_encode_event_id(&cmd, event_id); + its_encode_phys_id(&cmd, intid); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_invall_cmd(void *cmdq_base, u32 collection_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_INVALL); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/handlers.S b/tools/testing/selftests/kvm/lib/arm64/handlers.S new file mode 100644 index 000000000000..0e443eadfac6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/handlers.S @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +.macro save_registers + add sp, sp, #-16 * 17 + + stp x0, x1, [sp, #16 * 0] + stp x2, x3, [sp, #16 * 1] + stp x4, x5, [sp, #16 * 2] + stp x6, x7, [sp, #16 * 3] + stp x8, x9, [sp, #16 * 4] + stp x10, x11, [sp, #16 * 5] + stp x12, x13, [sp, #16 * 6] + stp x14, x15, [sp, #16 * 7] + stp x16, x17, [sp, #16 * 8] + stp x18, x19, [sp, #16 * 9] + stp x20, x21, [sp, #16 * 10] + stp x22, x23, [sp, #16 * 11] + stp x24, x25, [sp, #16 * 12] + stp x26, x27, [sp, #16 * 13] + stp x28, x29, [sp, #16 * 14] + + /* + * This stores sp_el1 into ex_regs.sp so exception handlers can "look" + * at it. It will _not_ be used to restore the sp on return from the + * exception so handlers can not update it. 
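+	 * The value saved here is the SP as it was on exception entry, i.e.
+	 * before the 16 * 17 bytes of this frame were pushed.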
+ */ + add x1, sp, #16 * 17 + stp x30, x1, [sp, #16 * 15] /* x30, SP */ + + mrs x1, elr_el1 + mrs x2, spsr_el1 + stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ +.endm + +.macro restore_registers + ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ + msr elr_el1, x1 + msr spsr_el1, x2 + + /* sp is not restored */ + ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ + + ldp x28, x29, [sp, #16 * 14] + ldp x26, x27, [sp, #16 * 13] + ldp x24, x25, [sp, #16 * 12] + ldp x22, x23, [sp, #16 * 11] + ldp x20, x21, [sp, #16 * 10] + ldp x18, x19, [sp, #16 * 9] + ldp x16, x17, [sp, #16 * 8] + ldp x14, x15, [sp, #16 * 7] + ldp x12, x13, [sp, #16 * 6] + ldp x10, x11, [sp, #16 * 5] + ldp x8, x9, [sp, #16 * 4] + ldp x6, x7, [sp, #16 * 3] + ldp x4, x5, [sp, #16 * 2] + ldp x2, x3, [sp, #16 * 1] + ldp x0, x1, [sp, #16 * 0] + + add sp, sp, #16 * 17 + + eret +.endm + +.pushsection ".entry.text", "ax" +.balign 0x800 +.global vectors +vectors: +.popsection + +.set vector, 0 + +/* + * Build an exception handler for vector and append a jump to it into + * vectors (while making sure that it's 0x80 aligned). + */ +.macro HANDLER, label +handler_\label: + save_registers + mov x0, sp + mov x1, #vector + bl route_exception + restore_registers + +.pushsection ".entry.text", "ax" +.balign 0x80 + b handler_\label +.popsection + +.set vector, vector + 1 +.endm + +.macro HANDLER_INVALID +.pushsection ".entry.text", "ax" +.balign 0x80 +/* This will abort so no need to save and restore registers. */ + mov x0, #vector + mov x1, #0 /* ec */ + mov x2, #0 /* valid_ec */ + b kvm_exit_unexpected_exception +.popsection + +.set vector, vector + 1 +.endm + +/* + * Caution: be sure to not add anything between the declaration of vectors + * above and these macro calls that will build the vectors table below it. + */ + HANDLER_INVALID // Synchronous EL1t + HANDLER_INVALID // IRQ EL1t + HANDLER_INVALID // FIQ EL1t + HANDLER_INVALID // Error EL1t + + HANDLER el1h_sync // Synchronous EL1h + HANDLER el1h_irq // IRQ EL1h + HANDLER el1h_fiq // FIQ EL1h + HANDLER el1h_error // Error EL1h + + HANDLER el0_sync_64 // Synchronous 64-bit EL0 + HANDLER el0_irq_64 // IRQ 64-bit EL0 + HANDLER el0_fiq_64 // FIQ 64-bit EL0 + HANDLER el0_error_64 // Error 64-bit EL0 + + HANDLER el0_sync_32 // Synchronous 32-bit EL0 + HANDLER el0_irq_32 // IRQ 32-bit EL0 + HANDLER el0_fiq_32 // FIQ 32-bit EL0 + HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c new file mode 100644 index 000000000000..7ba3aa3755f3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AArch64 code + * + * Copyright (C) 2018, Red Hat, Inc. 
+ */ + +#include +#include + +#include "guest_modes.h" +#include "kvm_util.h" +#include "processor.h" +#include "ucall_common.h" + +#include +#include + +#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 + +static vm_vaddr_t exception_handlers; + +static uint64_t page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size) & ~(vm->page_size - 1); +} + +static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; + + return (gva >> shift) & mask; +} + +static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels == 4, + "Mode %d does not have 4 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels >= 3, + "Mode %d does not have >= 3 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + return (gva >> vm->page_shift) & mask; +} + +static inline bool use_lpa2_pte_format(struct kvm_vm *vm) +{ + return (vm->page_size == SZ_4K || vm->page_size == SZ_16K) && + (vm->pa_bits > 48 || vm->va_bits > 48); +} + +static uint64_t addr_pte(struct kvm_vm *vm, uint64_t pa, uint64_t attrs) +{ + uint64_t pte; + + if (use_lpa2_pte_format(vm)) { + pte = pa & GENMASK(49, vm->page_shift); + pte |= FIELD_GET(GENMASK(51, 50), pa) << 8; + attrs &= ~GENMASK(9, 8); + } else { + pte = pa & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; + } + pte |= attrs; + + return pte; +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) +{ + uint64_t pa; + + if (use_lpa2_pte_format(vm)) { + pa = pte & GENMASK(49, vm->page_shift); + pa |= FIELD_GET(GENMASK(9, 8), pte) << 50; + } else { + pa = pte & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; + } + + return pa; +} + +static uint64_t ptrs_per_pgd(struct kvm_vm *vm) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + return 1 << (vm->va_bits - shift); +} + +static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) +{ + return 1 << (vm->page_shift - 3); +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) +{ + size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + + if (vm->pgd_created) + return; + + vm->pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->pgd_created = true; +} + +static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t flags) +{ + uint8_t attr_idx = flags & 7; + uint64_t *ptep; + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); + TEST_ASSERT((paddr 
>> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; + break; + default: + TEST_FAIL("Page table levels must be 2, 3, or 4"); + } + + *ptep = addr_pte(vm, paddr, (attr_idx << 2) | (1 << 10) | 3); /* AF */ +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + uint64_t attr_idx = MT_NORMAL; + + _virt_pg_map(vm, vaddr, paddr, attr_idx); +} + +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep; + + if (!vm->pgd_created) + goto unmapped_gva; + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + break; + default: + TEST_FAIL("Page table levels must be 2, 3, or 4"); + } + + return ptep; + +unmapped_gva: + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); + exit(EXIT_FAILURE); +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep = virt_get_pte_hva(vm, gva); + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); +} + +static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) +{ +#ifdef DEBUG + static const char * const type[] = { "", "pud", "pmd", "pte" }; + uint64_t pte, *ptep; + + if (level == 4) + return; + + for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { + ptep = addr_gpa2hva(vm, pte); + if (!*ptep) + continue; + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); + } +#endif +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + int level = 4 - (vm->pgtable_levels - 1); + uint64_t pgd, *ptep; + + if (!vm->pgd_created) + return; + + for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + ptep = addr_gpa2hva(vm, pgd); + if (!*ptep) + continue; + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); + } +} + +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) +{ + struct kvm_vcpu_init default_init = { .target = -1, }; + struct kvm_vm *vm = vcpu->vm; + uint64_t sctlr_el1, tcr_el1, ttbr0_el1; + + if (!init) + init = &default_init; + + if (init->target == -1) { + struct kvm_vcpu_init preferred; + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred); + init->target = preferred.target; + } + + 
vcpu_ioctl(vcpu, KVM_ARM_VCPU_INIT, init); + + /* + * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 + * registers, which the variable argument list macros do. + */ + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); + + sctlr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1)); + tcr_el1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1)); + + /* Configure base granule size */ + switch (vm->mode) { + case VM_MODE_PXXV48_4K: + TEST_FAIL("AArch64 does not support 4K sized pages " + "with ANY-bit physical address ranges"); + case VM_MODE_P52V48_64K: + case VM_MODE_P48V48_64K: + case VM_MODE_P40V48_64K: + case VM_MODE_P36V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + break; + case VM_MODE_P52V48_16K: + case VM_MODE_P48V48_16K: + case VM_MODE_P40V48_16K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ + break; + case VM_MODE_P52V48_4K: + case VM_MODE_P48V48_4K: + case VM_MODE_P40V48_4K: + case VM_MODE_P36V48_4K: + tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); + + /* Configure output size */ + switch (vm->mode) { + case VM_MODE_P52V48_4K: + case VM_MODE_P52V48_16K: + case VM_MODE_P52V48_64K: + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; + break; + case VM_MODE_P48V48_4K: + case VM_MODE_P48V48_16K: + case VM_MODE_P48V48_64K: + tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ + break; + case VM_MODE_P40V48_4K: + case VM_MODE_P40V48_16K: + case VM_MODE_P40V48_64K: + tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + break; + case VM_MODE_P36V48_4K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V48_64K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 1ul << 32; /* IPS = 36 bits */ + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; + /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; + tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); + tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; + if (use_lpa2_pte_format(vm)) + tcr_el1 |= (1ul << 59) /* DS */; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), ttbr0_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpu->id); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ + uint64_t pstate, pc; + + pstate = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate)); + pc = vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)); + + fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", + indent, "", pstate, pc); +} + +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); +} + +static struct kvm_vcpu *__aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init) +{ + size_t stack_size; + uint64_t stack_vaddr; + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + + stack_size = vm->page_size == 4096 ? 
DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); + + aarch64_vcpu_setup(vcpu, init); + + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + return vcpu; +} + +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code) +{ + struct kvm_vcpu *vcpu = __aarch64_vcpu_add(vm, vcpu_id, init); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +{ + return __aarch64_vcpu_add(vm, vcpu_id, NULL); +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) +{ + va_list ap; + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); + } + + va_end(ap); +} + +void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) +{ + ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); + while (1) + ; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) != UCALL_UNHANDLED) + return; + + if (uc.args[2]) /* valid_ec */ { + assert(VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } else { + assert(!VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx)", + uc.args[0]); + } +} + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM][ESR_ELx_EC_MAX + 1]; +}; + +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +{ + extern char vectors; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); +} + +void route_exception(struct ex_regs *regs, int vector) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + bool valid_ec; + int ec = 0; + + switch (vector) { + case VECTOR_SYNC_CURRENT: + case VECTOR_SYNC_LOWER_64: + ec = ESR_ELx_EC(read_sysreg(esr_el1)); + valid_ec = true; + break; + case VECTOR_IRQ_CURRENT: + case VECTOR_IRQ_LOWER_64: + case VECTOR_FIQ_CURRENT: + case VECTOR_FIQ_LOWER_64: + case VECTOR_ERROR_CURRENT: + case VECTOR_ERROR_LOWER_64: + ec = 0; + valid_ec = false; + break; + default: + valid_ec = false; + goto unexpected_exception; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + kvm_exit_unexpected_exception(vector, ec, valid_ec); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size, MEM_REGION_DATA); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + assert(ec <= ESR_ELx_EC_MAX); + handlers->exception_handlers[vector][ec] = handler; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(!VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector][0] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return 
read_sysreg(tpidr_el1); +} + +static uint32_t max_ipa_for_page_size(uint32_t vm_ipa, uint32_t gran, + uint32_t not_sup_val, uint32_t ipa52_min_val) +{ + if (gran == not_sup_val) + return 0; + else if (gran >= ipa52_min_val && vm_ipa >= 52) + return 52; + else + return min(vm_ipa, 48U); +} + +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k) +{ + struct kvm_vcpu_init preferred_init; + int kvm_fd, vm_fd, vcpu_fd, err; + uint64_t val; + uint32_t gran; + struct kvm_one_reg reg = { + .id = KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1), + .addr = (uint64_t)&val, + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, (void *)(unsigned long)ipa); + TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); + + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); + + err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_PREFERRED_TARGET, err)); + err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_VCPU_INIT, err)); + + err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); + *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, + ID_AA64MMFR0_EL1_TGRAN4_52_BIT); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); + *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, + ID_AA64MMFR0_EL1_TGRAN64_IMP); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); + *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, + ID_AA64MMFR0_EL1_TGRAN16_52_BIT); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +#define __smccc_call(insn, function_id, arg0, arg1, arg2, arg3, arg4, arg5, \ + arg6, res) \ + asm volatile("mov w0, %w[function_id]\n" \ + "mov x1, %[arg0]\n" \ + "mov x2, %[arg1]\n" \ + "mov x3, %[arg2]\n" \ + "mov x4, %[arg3]\n" \ + "mov x5, %[arg4]\n" \ + "mov x6, %[arg5]\n" \ + "mov x7, %[arg6]\n" \ + #insn "#0\n" \ + "mov %[res0], x0\n" \ + "mov %[res1], x1\n" \ + "mov %[res2], x2\n" \ + "mov %[res3], x3\n" \ + : [res0] "=r"(res->a0), [res1] "=r"(res->a1), \ + [res2] "=r"(res->a2), [res3] "=r"(res->a3) \ + : [function_id] "r"(function_id), [arg0] "r"(arg0), \ + [arg1] "r"(arg1), [arg2] "r"(arg2), [arg3] "r"(arg3), \ + [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6) \ + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7") + + +void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res) +{ + __smccc_call(hvc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, res); +} + +void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res) +{ + __smccc_call(smc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, res); +} + +void kvm_selftest_arch_init(void) +{ + /* + * arm64 doesn't have a true default mode, so start by computing the + * available IPA space and page sizes early. + */ + guest_modes_append_default(); +} + +void vm_vaddr_populate_bitmap(struct kvm_vm *vm) +{ + /* + * arm64 selftests use only TTBR0_EL1, meaning that the valid VA space + * is [0, 2^(64 - TCR_EL1.T0SZ)). 
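+	 * aarch64_vcpu_setup() programs T0SZ as 64 - vm->va_bits, so this is
+	 * simply [0, 2^va_bits).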
+ */ + sparsebit_set_num(vm->vpages_valid, 0, + (1ULL << vm->va_bits) >> vm->page_shift); +} + +/* Helper to call wfi instruction. */ +void wfi(void) +{ + asm volatile("wfi"); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/spinlock.c b/tools/testing/selftests/kvm/lib/arm64/spinlock.c new file mode 100644 index 000000000000..a076e780be5d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/spinlock.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 Spinlock support + */ +#include + +#include "spinlock.h" + +void spin_lock(struct spinlock *lock) +{ + int val, res; + + asm volatile( + "1: ldaxr %w0, [%2]\n" + " cbnz %w0, 1b\n" + " mov %w0, #1\n" + " stxr %w1, %w0, [%2]\n" + " cbnz %w1, 1b\n" + : "=&r" (val), "=&r" (res) + : "r" (&lock->v) + : "memory"); +} + +void spin_unlock(struct spinlock *lock) +{ + asm volatile("stlr wzr, [%0]\n" : : "r" (&lock->v) : "memory"); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/ucall.c b/tools/testing/selftests/kvm/lib/arm64/ucall.c new file mode 100644 index 000000000000..ddab0ce89d4d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/ucall.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include "kvm_util.h" + +vm_vaddr_t *ucall_exit_mmio_addr; + +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ + vm_vaddr_t mmio_gva = vm_vaddr_unused_gap(vm, vm->page_size, KVM_UTIL_MIN_VADDR); + + virt_map(vm, mmio_gva, mmio_gpa, 1); + + vm->ucall_mmio_addr = mmio_gpa; + + write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gva); +} + +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (run->exit_reason == KVM_EXIT_MMIO && + run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) { + TEST_ASSERT(run->mmio.is_write && run->mmio.len == sizeof(uint64_t), + "Unexpected ucall exit mmio address access"); + return (void *)(*((uint64_t *)run->mmio.data)); + } + + return NULL; +} diff --git a/tools/testing/selftests/kvm/lib/arm64/vgic.c b/tools/testing/selftests/kvm/lib/arm64/vgic.c new file mode 100644 index 000000000000..4427f43f73ea --- /dev/null +++ b/tools/testing/selftests/kvm/lib/arm64/vgic.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 host support + */ + +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" + +/* + * vGIC-v3 default host setup + * + * Input args: + * vm - KVM VM + * nr_vcpus - Number of vCPUs supported by this VM + * + * Output args: None + * + * Return: GIC file-descriptor or negative error code upon failure + * + * The function creates a vGIC-v3 device and maps the distributor and + * redistributor regions of the guest. Since it depends on the number of + * vCPUs for the VM, it must be called after all the vCPUs have been created. + */ +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) +{ + int gic_fd; + uint64_t attr; + struct list_head *iter; + unsigned int nr_gic_pages, nr_vcpus_created = 0; + + TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty"); + + /* + * Make sure that the caller is infact calling this + * function after all the vCPUs are added. 
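+	 * The redistributor region registered below is sized for exactly
+	 * nr_vcpus, so a vCPU added afterwards would be left without a
+	 * redistributor.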
+ */ + list_for_each(iter, &vm->vcpus) + nr_vcpus_created++; + TEST_ASSERT(nr_vcpus == nr_vcpus_created, + "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)", + nr_vcpus, nr_vcpus_created); + + /* Distributor setup */ + gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (gic_fd < 0) + return gic_fd; + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + attr = GICD_BASE_GPA; + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); + virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); + + /* Redistributor setup */ + attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, + KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); + virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + return gic_fd; +} + +/* should only work for level sensitive interrupts */ +int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + uint64_t attr = 32 * (intid / 32); + uint64_t index = intid % 32; + uint64_t val; + int ret; + + ret = __kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + if (ret != 0) + return ret; + + val |= 1U << index; + ret = __kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + return ret; +} + +void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + int ret = _kvm_irq_set_level_info(gic_fd, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, ret)); +} + +int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK; + + TEST_ASSERT(!INTID_IS_SGI(intid), "KVM_IRQ_LINE's interface itself " + "doesn't allow injecting SGIs. There's no mask for it."); + + if (INTID_IS_PPI(intid)) + irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; + else + irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT; + + return _kvm_irq_line(vm, irq, level); +} + +void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + int ret = _kvm_arm_irq_line(vm, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); +} + +static void vgic_poke_irq(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu, + uint64_t reg_off) +{ + uint64_t reg = intid / 32; + uint64_t index = intid % 32; + uint64_t attr = reg_off + reg * 4; + uint64_t val; + bool intid_is_private = INTID_IS_SGI(intid) || INTID_IS_PPI(intid); + + uint32_t group = intid_is_private ? KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + : KVM_DEV_ARM_VGIC_GRP_DIST_REGS; + + if (intid_is_private) { + /* TODO: only vcpu 0 implemented for now. */ + assert(vcpu->id == 0); + attr += SZ_64K; + } + + /* Check that the addr part of the attr is within 32 bits. */ + assert((attr & ~KVM_DEV_ARM_VGIC_OFFSET_MASK) == 0); + + /* + * All calls will succeed, even with invalid intid's, as long as the + * addr part of the attr is within 32 bits (checked above). An invalid + * intid will just make the read/writes point to above the intended + * register space (i.e., ICPENDR after ISPENDR). 
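+	 * Each of these registers is a 32-bit bank covering 32 INTIDs, which
+	 * is why reg and index are derived from intid / 32 and intid % 32
+	 * above.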
+ */ + kvm_device_attr_get(gic_fd, group, attr, &val); + val |= 1ULL << index; + kvm_device_attr_set(gic_fd, group, attr, &val); +} + +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR); +} + +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); +} + +int vgic_its_setup(struct kvm_vm *vm) +{ + int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + u64 attr; + + attr = GITS_BASE_GPA; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &attr); + + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, + vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); + + return its_fd; +} diff --git a/tools/testing/selftests/kvm/lib/s390/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390/diag318_test_handler.c new file mode 100644 index 000000000000..2c432fa164f1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390/diag318_test_handler.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test handler for the s390x DIAGNOSE 0x0318 instruction. + * + * Copyright (C) 2020, IBM + */ + +#include "test_util.h" +#include "kvm_util.h" + +#define ICPT_INSTRUCTION 0x04 +#define IPA0_DIAG 0x8300 + +static void guest_code(void) +{ + uint64_t diag318_info = 0x12345678; + + asm volatile ("diag %0,0,0x318\n" : : "d" (diag318_info)); +} + +/* + * The DIAGNOSE 0x0318 instruction call must be handled via userspace. As such, + * we create an ad-hoc VM here to handle the instruction then extract the + * necessary data. It is up to the caller to decide what to do with that data. + */ +static uint64_t diag318_handler(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_run *run; + uint64_t reg; + uint64_t diag318_info; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_run(vcpu); + run = vcpu->run; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT(run->s390_sieic.icptcode == ICPT_INSTRUCTION, + "Unexpected intercept code: 0x%x", run->s390_sieic.icptcode); + TEST_ASSERT((run->s390_sieic.ipa & 0xff00) == IPA0_DIAG, + "Unexpected IPA0 code: 0x%x", (run->s390_sieic.ipa & 0xff00)); + + reg = (run->s390_sieic.ipa & 0x00f0) >> 4; + diag318_info = run->s.regs.gprs[reg]; + + TEST_ASSERT(diag318_info != 0, "DIAGNOSE 0x0318 info not set"); + + kvm_vm_free(vm); + + return diag318_info; +} + +uint64_t get_diag318_info(void) +{ + static uint64_t diag318_info; + static bool printed_skip; + + /* + * If KVM does not support diag318, then return 0 to + * ensure tests do not break. + */ + if (!kvm_has_cap(KVM_CAP_S390_DIAG318)) { + if (!printed_skip) { + fprintf(stdout, "KVM_CAP_S390_DIAG318 not supported. " + "Skipping diag318 test.\n"); + printed_skip = true; + } + return 0; + } + + /* + * If a test has previously requested the diag318 info, + * then don't bother spinning up a temporary VM again. + */ + if (!diag318_info) + diag318_info = diag318_handler(); + + return diag318_info; +} diff --git a/tools/testing/selftests/kvm/lib/s390/facility.c b/tools/testing/selftests/kvm/lib/s390/facility.c new file mode 100644 index 000000000000..d540812d911a --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390/facility.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright IBM Corp. 
2024 + * + * Authors: + * Hariharan Mari + * + * Contains the definition for the global variables to have the test facitlity feature. + */ + +#include "facility.h" + +uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS]; +bool stfle_flag; diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c new file mode 100644 index 000000000000..20cfe970e3e3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390/processor.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM selftest s390x library code - CPU-related functions (page tables...) + * + * Copyright (C) 2019, Red Hat, Inc. + */ + +#include "processor.h" +#include "kvm_util.h" + +#define PAGES_PER_REGION 4 + +void virt_arch_pgd_alloc(struct kvm_vm *vm) +{ + vm_paddr_t paddr; + + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", + vm->page_size); + + if (vm->pgd_created) + return; + + paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); + + vm->pgd = paddr; + vm->pgd_created = true; +} + +/* + * Allocate 4 pages for a region/segment table (ri < 4), or one page for + * a page table (ri == 4). Returns a suitable region/segment table entry + * which points to the freshly allocated pages. + */ +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) +{ + uint64_t taddr; + + taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); + + return (taddr & REGION_ENTRY_ORIGIN) + | (((4 - ri) << 2) & REGION_ENTRY_TYPE) + | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT((gva % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", + gva, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (gva >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + gva); + TEST_ASSERT((gpa % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + gva, vm->page_size); + TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + gva, vm->max_gfn, vm->page_size); + + /* Walk through region and segment tables */ + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + if (entry[idx] & REGION_ENTRY_INVALID) + entry[idx] = virt_alloc_region(vm, ri); + entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); + } + + /* Fill in page table entry */ + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ + if (!(entry[idx] & PAGE_INVALID)) + fprintf(stderr, + "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); + entry[idx] = gpa; +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", + vm->page_size); + + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), + "No region mapping for vm virtual address 0x%lx", + gva); + entry = addr_gpa2hva(vm, entry[idx] & 
REGION_ENTRY_ORIGIN); + } + + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ + + TEST_ASSERT(!(entry[idx] & PAGE_INVALID), + "No page mapping for vm virtual address 0x%lx", gva); + + return (entry[idx] & ~0xffful) + (gva & 0xffful); +} + +static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t ptea_start) +{ + uint64_t *pte, ptea; + + for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) { + pte = addr_gpa2hva(vm, ptea); + if (*pte & PAGE_INVALID) + continue; + fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n", + indent, "", ptea, *pte); + } +} + +static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t reg_tab_addr) +{ + uint64_t addr, *entry; + + for (addr = reg_tab_addr; addr < reg_tab_addr + 0x400 * 8; addr += 8) { + entry = addr_gpa2hva(vm, addr); + if (*entry & REGION_ENTRY_INVALID) + continue; + fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n", + indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2), + addr, *entry); + if (*entry & REGION_ENTRY_TYPE) { + virt_dump_region(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } else { + virt_dump_ptes(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } + } +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + if (!vm->pgd_created) + return; + + virt_dump_region(stream, vm, indent, vm->pgd); +} + +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + vcpu->run->psw_addr = (uintptr_t)guest_code; +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +{ + size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); + uint64_t stack_vaddr; + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; + + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", + vm->page_size); + + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); + + vcpu = __vm_vcpu_add(vm, vcpu_id); + + /* Setup guest registers */ + vcpu_regs_get(vcpu, ®s); + regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; + vcpu_regs_set(vcpu, ®s); + + vcpu_sregs_get(vcpu, &sregs); + sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ + sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + vcpu_sregs_set(vcpu, &sregs); + + vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ + + return vcpu; +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) +{ + va_list ap; + struct kvm_regs regs; + int i; + + TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" + " num: %u", + num); + + va_start(ap, num); + vcpu_regs_get(vcpu, ®s); + + for (i = 0; i < num; i++) + regs.gprs[i + 2] = va_arg(ap, uint64_t); + + vcpu_regs_set(vcpu, ®s); + va_end(ap); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ + fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", + indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ +} diff --git a/tools/testing/selftests/kvm/lib/s390/ucall.c b/tools/testing/selftests/kvm/lib/s390/ucall.c new file mode 100644 index 000000000000..cca98734653d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390/ucall.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2019 Red Hat, Inc. 
+ */ +#include "kvm_util.h" + +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (run->exit_reason == KVM_EXIT_S390_SIEIC && + run->s390_sieic.icptcode == 4 && + (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ + (run->s390_sieic.ipb >> 16) == 0x501) { + int reg = run->s390_sieic.ipa & 0xf; + + return (void *)run->s.regs.gprs[reg]; + } + return NULL; +} diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c deleted file mode 100644 index 2c432fa164f1..000000000000 --- a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Test handler for the s390x DIAGNOSE 0x0318 instruction. - * - * Copyright (C) 2020, IBM - */ - -#include "test_util.h" -#include "kvm_util.h" - -#define ICPT_INSTRUCTION 0x04 -#define IPA0_DIAG 0x8300 - -static void guest_code(void) -{ - uint64_t diag318_info = 0x12345678; - - asm volatile ("diag %0,0,0x318\n" : : "d" (diag318_info)); -} - -/* - * The DIAGNOSE 0x0318 instruction call must be handled via userspace. As such, - * we create an ad-hoc VM here to handle the instruction then extract the - * necessary data. It is up to the caller to decide what to do with that data. - */ -static uint64_t diag318_handler(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_run *run; - uint64_t reg; - uint64_t diag318_info; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_run(vcpu); - run = vcpu->run; - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT(run->s390_sieic.icptcode == ICPT_INSTRUCTION, - "Unexpected intercept code: 0x%x", run->s390_sieic.icptcode); - TEST_ASSERT((run->s390_sieic.ipa & 0xff00) == IPA0_DIAG, - "Unexpected IPA0 code: 0x%x", (run->s390_sieic.ipa & 0xff00)); - - reg = (run->s390_sieic.ipa & 0x00f0) >> 4; - diag318_info = run->s.regs.gprs[reg]; - - TEST_ASSERT(diag318_info != 0, "DIAGNOSE 0x0318 info not set"); - - kvm_vm_free(vm); - - return diag318_info; -} - -uint64_t get_diag318_info(void) -{ - static uint64_t diag318_info; - static bool printed_skip; - - /* - * If KVM does not support diag318, then return 0 to - * ensure tests do not break. - */ - if (!kvm_has_cap(KVM_CAP_S390_DIAG318)) { - if (!printed_skip) { - fprintf(stdout, "KVM_CAP_S390_DIAG318 not supported. " - "Skipping diag318 test.\n"); - printed_skip = true; - } - return 0; - } - - /* - * If a test has previously requested the diag318 info, - * then don't bother spinning up a temporary VM again. - */ - if (!diag318_info) - diag318_info = diag318_handler(); - - return diag318_info; -} diff --git a/tools/testing/selftests/kvm/lib/s390x/facility.c b/tools/testing/selftests/kvm/lib/s390x/facility.c deleted file mode 100644 index d540812d911a..000000000000 --- a/tools/testing/selftests/kvm/lib/s390x/facility.c +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright IBM Corp. 2024 - * - * Authors: - * Hariharan Mari - * - * Contains the definition for the global variables to have the test facitlity feature. 
- */ - -#include "facility.h" - -uint64_t stfl_doublewords[NB_STFL_DOUBLEWORDS]; -bool stfle_flag; diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c deleted file mode 100644 index 20cfe970e3e3..000000000000 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * KVM selftest s390x library code - CPU-related functions (page tables...) - * - * Copyright (C) 2019, Red Hat, Inc. - */ - -#include "processor.h" -#include "kvm_util.h" - -#define PAGES_PER_REGION 4 - -void virt_arch_pgd_alloc(struct kvm_vm *vm) -{ - vm_paddr_t paddr; - - TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", - vm->page_size); - - if (vm->pgd_created) - return; - - paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); - - vm->pgd = paddr; - vm->pgd_created = true; -} - -/* - * Allocate 4 pages for a region/segment table (ri < 4), or one page for - * a page table (ri == 4). Returns a suitable region/segment table entry - * which points to the freshly allocated pages. - */ -static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) -{ - uint64_t taddr; - - taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); - memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); - - return (taddr & REGION_ENTRY_ORIGIN) - | (((4 - ri) << 2) & REGION_ENTRY_TYPE) - | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); -} - -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) -{ - int ri, idx; - uint64_t *entry; - - TEST_ASSERT((gva % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", - gva, vm->page_size); - TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, - (gva >> vm->page_shift)), - "Invalid virtual address, vaddr: 0x%lx", - gva); - TEST_ASSERT((gpa % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - gva, vm->page_size); - TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - gva, vm->max_gfn, vm->page_size); - - /* Walk through region and segment tables */ - entry = addr_gpa2hva(vm, vm->pgd); - for (ri = 1; ri <= 4; ri++) { - idx = (gva >> (64 - 11 * ri)) & 0x7ffu; - if (entry[idx] & REGION_ENTRY_INVALID) - entry[idx] = virt_alloc_region(vm, ri); - entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); - } - - /* Fill in page table entry */ - idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ - if (!(entry[idx] & PAGE_INVALID)) - fprintf(stderr, - "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); - entry[idx] = gpa; -} - -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - int ri, idx; - uint64_t *entry; - - TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", - vm->page_size); - - entry = addr_gpa2hva(vm, vm->pgd); - for (ri = 1; ri <= 4; ri++) { - idx = (gva >> (64 - 11 * ri)) & 0x7ffu; - TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), - "No region mapping for vm virtual address 0x%lx", - gva); - entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); - } - - idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ - - TEST_ASSERT(!(entry[idx] & PAGE_INVALID), - "No page 
mapping for vm virtual address 0x%lx", gva); - - return (entry[idx] & ~0xffful) + (gva & 0xffful); -} - -static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent, - uint64_t ptea_start) -{ - uint64_t *pte, ptea; - - for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) { - pte = addr_gpa2hva(vm, ptea); - if (*pte & PAGE_INVALID) - continue; - fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n", - indent, "", ptea, *pte); - } -} - -static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, - uint64_t reg_tab_addr) -{ - uint64_t addr, *entry; - - for (addr = reg_tab_addr; addr < reg_tab_addr + 0x400 * 8; addr += 8) { - entry = addr_gpa2hva(vm, addr); - if (*entry & REGION_ENTRY_INVALID) - continue; - fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n", - indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2), - addr, *entry); - if (*entry & REGION_ENTRY_TYPE) { - virt_dump_region(stream, vm, indent + 2, - *entry & REGION_ENTRY_ORIGIN); - } else { - virt_dump_ptes(stream, vm, indent + 2, - *entry & REGION_ENTRY_ORIGIN); - } - } -} - -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - if (!vm->pgd_created) - return; - - virt_dump_region(stream, vm, indent, vm->pgd); -} - -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) -{ - vcpu->run->psw_addr = (uintptr_t)guest_code; -} - -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) -{ - size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); - uint64_t stack_vaddr; - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_vcpu *vcpu; - - TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", - vm->page_size); - - stack_vaddr = __vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN, - MEM_REGION_DATA); - - vcpu = __vm_vcpu_add(vm, vcpu_id); - - /* Setup guest registers */ - vcpu_regs_get(vcpu, ®s); - regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; - vcpu_regs_set(vcpu, ®s); - - vcpu_sregs_get(vcpu, &sregs); - sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ - sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ - vcpu_sregs_set(vcpu, &sregs); - - vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ - - return vcpu; -} - -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) -{ - va_list ap; - struct kvm_regs regs; - int i; - - TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" - " num: %u", - num); - - va_start(ap, num); - vcpu_regs_get(vcpu, ®s); - - for (i = 0; i < num; i++) - regs.gprs[i + 2] = va_arg(ap, uint64_t); - - vcpu_regs_set(vcpu, ®s); - va_end(ap); -} - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) -{ - fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", - indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ -} diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c deleted file mode 100644 index cca98734653d..000000000000 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ucall support. A ucall is a "hypercall to userspace". - * - * Copyright (C) 2019 Red Hat, Inc. 
- */ -#include "kvm_util.h" - -void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - if (run->exit_reason == KVM_EXIT_S390_SIEIC && - run->s390_sieic.icptcode == 4 && - (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ - (run->s390_sieic.ipb >> 16) == 0x501) { - int reg = run->s390_sieic.ipa & 0xf; - - return (void *)run->s.regs.gprs[reg]; - } - return NULL; -} diff --git a/tools/testing/selftests/kvm/lib/x86/apic.c b/tools/testing/selftests/kvm/lib/x86/apic.c new file mode 100644 index 000000000000..89153a333e83 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/apic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Google LLC. + */ + +#include "apic.h" + +void apic_disable(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void xapic_enable(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + apic_disable(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. + */ + val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + xapic_write_reg(APIC_SPIV, val); +} + +void x2apic_enable(void) +{ + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + x2apic_write_reg(APIC_SPIV, + x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); +} diff --git a/tools/testing/selftests/kvm/lib/x86/handlers.S b/tools/testing/selftests/kvm/lib/x86/handlers.S new file mode 100644 index 000000000000..7629819734af --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ +666 : +.pushsection .rodata + .quad 666b +.popsection + + .if ! 
\has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86/hyperv.c b/tools/testing/selftests/kvm/lib/x86/hyperv.c new file mode 100644 index 000000000000..15bc8cd583aa --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/hyperv.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Hyper-V specific functions. + * + * Copyright (C) 2021, Red Hat Inc. + */ +#include +#include "processor.h" +#include "hyperv.h" + +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); + + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + close(kvm_fd); + return cpuid; +} + +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) +{ + static struct kvm_cpuid2 *cpuid_full; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); + if (!cpuid_full) { + perror("malloc"); + abort(); + } + + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; + } + + vcpu_init_cpuid(vcpu, cpuid_full); +} + +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} + +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) +{ + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) + return false; + + return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); +} + +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva) +{ + vm_vaddr_t hv_pages_gva = vm_vaddr_alloc_page(vm); + struct hyperv_test_pages *hv = addr_gva2hva(vm, hv_pages_gva); + + /* Setup of a region of guest memory for the VP Assist page. */ + hv->vp_assist = (void *)vm_vaddr_alloc_page(vm); + hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist); + hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist); + + /* Setup of a region of guest memory for the partition assist page. */ + hv->partition_assist = (void *)vm_vaddr_alloc_page(vm); + hv->partition_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->partition_assist); + hv->partition_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->partition_assist); + + /* Setup of a region of guest memory for the enlightened VMCS. 
*/ + hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); + hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs); + hv->enlightened_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)hv->enlightened_vmcs); + + *p_hv_pages_gva = hv_pages_gva; + return hv; +} + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +{ + uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); + + current_vp_assist = vp_assist; + + return 0; +} diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c new file mode 100644 index 000000000000..7f5d62a65c68 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86-specific extensions to memstress.c. + * + * Copyright (C) 2022, Google, Inc. + */ +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "memstress.h" +#include "processor.h" +#include "vmx.h" + +void memstress_l2_guest_code(uint64_t vcpu_id) +{ + memstress_guest_code(vcpu_id); + vmcall(); +} + +extern char memstress_l2_guest_entry[]; +__asm__( +"memstress_l2_guest_entry:" +" mov (%rsp), %rdi;" +" call memstress_l2_guest_code;" +" ud2;" +); + +static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + GUEST_ASSERT(ept_1g_pages_supported()); + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +uint64_t memstress_nested_pages(int nr_vcpus) +{ + /* + * 513 page tables is enough to identity-map 256 TiB of L2 with 1G + * pages and 4-level paging, plus a few pages per-vCPU for data + * structures such as the VMCS. + */ + return 513 + 10 * nr_vcpus; +} + +void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + uint64_t start, end; + + prepare_eptp(vmx, vm, 0); + + /* + * Identity map the first 4G and the test region with 1G pages so that + * KVM can shadow the EPT12 with the maximum huge page size supported + * by the backing source. + */ + nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + + start = align_down(memstress_args.gpa, PG_SIZE_1G); + end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); + nested_identity_map_1g(vmx, vm, start, end - start); +} + +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) +{ + struct vmx_pages *vmx, *vmx0 = NULL; + struct kvm_regs regs; + vm_vaddr_t vmx_gva; + int vcpu_id; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has_ept()); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vmx = vcpu_alloc_vmx(vm, &vmx_gva); + + if (vcpu_id == 0) { + memstress_setup_ept(vmx, vm); + vmx0 = vmx; + } else { + /* Share the same EPT table across all vCPUs. */ + vmx->eptp = vmx0->eptp; + vmx->eptp_hva = vmx0->eptp_hva; + vmx->eptp_gpa = vmx0->eptp_gpa; + } + + /* + * Override the vCPU to run memstress_l1_guest_code() which will + * bounce it into L2 before calling memstress_guest_code(). 
+ */ + vcpu_regs_get(vcpus[vcpu_id], ®s); + regs.rip = (unsigned long) memstress_l1_guest_code; + vcpu_regs_set(vcpus[vcpu_id], ®s); + vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + } +} diff --git a/tools/testing/selftests/kvm/lib/x86/pmu.c b/tools/testing/selftests/kvm/lib/x86/pmu.c new file mode 100644 index 000000000000..f31f0427c17c --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/pmu.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Tencent, Inc. + */ + +#include + +#include + +#include "kvm_util.h" +#include "pmu.h" + +const uint64_t intel_pmu_arch_events[] = { + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + INTEL_ARCH_TOPDOWN_SLOTS, +}; +kvm_static_assert(ARRAY_SIZE(intel_pmu_arch_events) == NR_INTEL_ARCH_EVENTS); + +const uint64_t amd_pmu_zen_events[] = { + AMD_ZEN_CORE_CYCLES, + AMD_ZEN_INSTRUCTIONS_RETIRED, + AMD_ZEN_BRANCHES_RETIRED, + AMD_ZEN_BRANCHES_MISPREDICTED, +}; +kvm_static_assert(ARRAY_SIZE(amd_pmu_zen_events) == NR_AMD_ZEN_EVENTS); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c new file mode 100644 index 000000000000..bd5a802fa7a5 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -0,0 +1,1293 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018, Google LLC. + */ + +#include "linux/bitmap.h" +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "sev.h" + +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif + +#define KERNEL_CS 0x8 +#define KERNEL_DS 0x10 +#define KERNEL_TSS 0x18 + +vm_vaddr_t exception_handlers; +bool host_cpu_is_amd; +bool host_cpu_is_intel; +bool is_forced_emulation_enabled; +uint64_t guest_tsc_khz; + +static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) +{ + fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " + "rcx: 0x%.16llx rdx: 0x%.16llx\n", + indent, "", + regs->rax, regs->rbx, regs->rcx, regs->rdx); + fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx " + "rsp: 0x%.16llx rbp: 0x%.16llx\n", + indent, "", + regs->rsi, regs->rdi, regs->rsp, regs->rbp); + fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx " + "r10: 0x%.16llx r11: 0x%.16llx\n", + indent, "", + regs->r8, regs->r9, regs->r10, regs->r11); + fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx " + "r14: 0x%.16llx r15: 0x%.16llx\n", + indent, "", + regs->r12, regs->r13, regs->r14, regs->r15); + fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n", + indent, "", + regs->rip, regs->rflags); +} + +static void segment_dump(FILE *stream, struct kvm_segment *segment, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x " + "selector: 0x%.4x type: 0x%.2x\n", + indent, "", segment->base, segment->limit, + segment->selector, segment->type); + fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x " + "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n", + indent, "", segment->present, segment->dpl, + segment->db, segment->s, segment->l); + fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x " + "unusable: 0x%.2x padding: 0x%.2x\n", + indent, "", segment->g, segment->avl, + segment->unusable, segment->padding); +} + +static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x " + "padding: 0x%.4x 0x%.4x 0x%.4x\n", + indent, "", dtable->base, dtable->limit, + 
dtable->padding[0], dtable->padding[1], dtable->padding[2]); +} + +static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) +{ + unsigned int i; + + fprintf(stream, "%*scs:\n", indent, ""); + segment_dump(stream, &sregs->cs, indent + 2); + fprintf(stream, "%*sds:\n", indent, ""); + segment_dump(stream, &sregs->ds, indent + 2); + fprintf(stream, "%*ses:\n", indent, ""); + segment_dump(stream, &sregs->es, indent + 2); + fprintf(stream, "%*sfs:\n", indent, ""); + segment_dump(stream, &sregs->fs, indent + 2); + fprintf(stream, "%*sgs:\n", indent, ""); + segment_dump(stream, &sregs->gs, indent + 2); + fprintf(stream, "%*sss:\n", indent, ""); + segment_dump(stream, &sregs->ss, indent + 2); + fprintf(stream, "%*str:\n", indent, ""); + segment_dump(stream, &sregs->tr, indent + 2); + fprintf(stream, "%*sldt:\n", indent, ""); + segment_dump(stream, &sregs->ldt, indent + 2); + + fprintf(stream, "%*sgdt:\n", indent, ""); + dtable_dump(stream, &sregs->gdt, indent + 2); + fprintf(stream, "%*sidt:\n", indent, ""); + dtable_dump(stream, &sregs->idt, indent + 2); + + fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx " + "cr3: 0x%.16llx cr4: 0x%.16llx\n", + indent, "", + sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4); + fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx " + "apic_base: 0x%.16llx\n", + indent, "", + sregs->cr8, sregs->efer, sregs->apic_base); + + fprintf(stream, "%*sinterrupt_bitmap:\n", indent, ""); + for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) { + fprintf(stream, "%*s%.16llx\n", indent + 2, "", + sregs->interrupt_bitmap[i]); + } +} + +bool kvm_is_tdp_enabled(void) +{ + if (host_cpu_is_intel) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) +{ + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + /* If needed, create page map l4 table. */ + if (!vm->pgd_created) { + vm->pgd = vm_alloc_page_table(vm); + vm->pgd_created = true; + } +} + +static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, + uint64_t vaddr, int level) +{ + uint64_t pt_gpa = PTE_GET_PA(*parent_pte); + uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); + int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", + level + 1, vaddr); + + return &page_table[index]; +} + +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t *parent_pte, + uint64_t vaddr, + uint64_t paddr, + int current_level, + int target_level) +{ + uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); + + paddr = vm_untag_gpa(vm, paddr); + + if (!(*pte & PTE_PRESENT_MASK)) { + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (current_level == target_level) + *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + else + *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. 
+ */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, vaddr: 0x%lx", + current_level, vaddr); + TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + "Cannot create page table at level: %u, vaddr: 0x%lx", + current_level, vaddr); + } + return pte; +} + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) +{ + const uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % pg_size) == 0, + "Virtual address not aligned,\n" + "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % pg_size) == 0, + "Physical address not aligned,\n" + " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, + "Unexpected bits in paddr: %lx", paddr); + + /* + * Allocate upper level page tables, if not already present. Return + * early if a hugepage was created. + */ + pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); + if (*pml4e & PTE_LARGE_MASK) + return; + + pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); + if (*pdpe & PTE_LARGE_MASK) + return; + + pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); + if (*pde & PTE_LARGE_MASK) + return; + + /* Fill in page table entry. */ + pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + "PTE already present for 4k page at vaddr: 0x%lx", vaddr); + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + + /* + * Neither SEV nor TDX supports shared page tables, so only the final + * leaf PTE needs manually set the C/S-bit. 
+ */ + if (vm_is_gpa_protected(vm, paddr)) + *pte |= vm->arch.c_bit; + else + *pte |= vm->arch.s_bit; +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); +} + +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + +static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +{ + if (*pte & PTE_LARGE_MASK) { + TEST_ASSERT(*level == PG_LEVEL_NONE || + *level == current_level, + "Unexpected hugepage at level %d", current_level); + *level = current_level; + } + + return *level == current_level; +} + +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) +{ + uint64_t *pml4e, *pdpe, *pde; + + TEST_ASSERT(!vm->arch.is_pt_protected, + "Walking page tables of protected guests is impossible"); + + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + "Invalid PG_LEVEL_* '%d'", *level); + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + vaddr); + /* + * Based on the mode check above there are 48 bits in the vaddr, so + * shift 16 to sign extend the last bit (bit-47), + */ + TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), + "Canonical check failed. 
The virtual address is invalid."); + + pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); + if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) + return pml4e; + + pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); + if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) + return pdpe; + + pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); + if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) + return pde; + + return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); +} + +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +{ + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, vaddr, &level); +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + uint64_t *pml4e, *pml4e_start; + uint64_t *pdpe, *pdpe_start; + uint64_t *pde, *pde_start; + uint64_t *pte, *pte_start; + + if (!vm->pgd_created) + return; + + fprintf(stream, "%*s " + " no\n", indent, ""); + fprintf(stream, "%*s index hvaddr gpaddr " + "addr w exec dirty\n", + indent, ""); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); + for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { + pml4e = &pml4e_start[n1]; + if (!(*pml4e & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " + " %u\n", + indent, "", + pml4e - pml4e_start, pml4e, + addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), + !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); + + pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); + for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { + pdpe = &pdpe_start[n2]; + if (!(*pdpe & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " + "%u %u\n", + indent, "", + pdpe - pdpe_start, pdpe, + addr_hva2gpa(vm, pdpe), + PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), + !!(*pdpe & PTE_NX_MASK)); + + pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); + for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { + pde = &pde_start[n3]; + if (!(*pde & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spde 0x%-3zx %p " + "0x%-12lx 0x%-10llx %u %u\n", + indent, "", pde - pde_start, pde, + addr_hva2gpa(vm, pde), + PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), + !!(*pde & PTE_NX_MASK)); + + pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); + for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { + pte = &pte_start[n4]; + if (!(*pte & PTE_PRESENT_MASK)) + continue; + fprintf(stream, "%*spte 0x%-3zx %p " + "0x%-12lx 0x%-10llx %u %u " + " %u 0x%-10lx\n", + indent, "", + pte - pte_start, pte, + addr_hva2gpa(vm, pte), + PTE_GET_PFN(*pte), + !!(*pte & PTE_WRITABLE_MASK), + !!(*pte & PTE_NX_MASK), + !!(*pte & PTE_DIRTY_MASK), + ((uint64_t) n1 << 27) + | ((uint64_t) n2 << 18) + | ((uint64_t) n3 << 9) + | ((uint64_t) n4)); + } + } + } + } +} + +/* + * Set Unusable Segment + * + * Input Args: None + * + * Output Args: + * segp - Pointer to segment register + * + * Return: None + * + * Sets the segment register pointed to by @segp to an unusable state. 
+ */ +static void kvm_seg_set_unusable(struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->unusable = true; +} + +static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) +{ + void *gdt = addr_gva2hva(vm, vm->arch.gdt); + struct desc64 *desc = gdt + (segp->selector >> 3) * 8; + + desc->limit0 = segp->limit & 0xFFFF; + desc->base0 = segp->base & 0xFFFF; + desc->base1 = segp->base >> 16; + desc->type = segp->type; + desc->s = segp->s; + desc->dpl = segp->dpl; + desc->p = segp->present; + desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; + desc->l = segp->l; + desc->db = segp->db; + desc->g = segp->g; + desc->base2 = segp->base >> 24; + if (!segp->s) + desc->base3 = segp->base >> 32; +} + +static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = KERNEL_CS; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed + * | kFlagCodeReadable + */ + segp->g = true; + segp->l = true; + segp->present = 1; +} + +static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = KERNEL_DS; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed + * | kFlagDataWritable + */ + segp->g = true; + segp->present = true; +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + int level = PG_LEVEL_NONE; + uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); + + TEST_ASSERT(*pte & PTE_PRESENT_MASK, + "Leaf PTE not PRESENT for gva: 0x%08lx", gva); + + /* + * No need for a hugepage mask on the PTE, x86-64 requires the "unused" + * address bits to be zero. + */ + return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); +} + +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->base = base; + segp->limit = 0x67; + segp->selector = KERNEL_TSS; + segp->type = 0xb; + segp->present = 1; +} + +static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct kvm_sregs sregs; + + TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + + /* Set mode specific system register values. 
*/ + vcpu_sregs_get(vcpu, &sregs); + + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; + + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + if (kvm_cpu_has(X86_FEATURE_XSAVE)) + sregs.cr4 |= X86_CR4_OSXSAVE; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(&sregs.cs); + kvm_seg_set_kernel_data_64bit(&sregs.ds); + kvm_seg_set_kernel_data_64bit(&sregs.es); + kvm_seg_set_kernel_data_64bit(&sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); + + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vcpu, &sregs); +} + +static void vcpu_init_xcrs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = kvm_cpu_supported_xcr0(), + }; + + if (!kvm_cpu_has(X86_FEATURE_XSAVE)) + return; + + vcpu_xcrs_set(vcpu, &xcrs); +} + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; + + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + regs->r10 = regs->error_code; + return true; +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + if (kvm_fixup_exception(regs)) + return; + + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); +} + +static void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + struct kvm_segment seg; + int i; + + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; + + kvm_seg_set_kernel_code_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_kernel_data_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_tss_64bit(vm->arch.tss, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_ABORT) + REPORT_GUEST_ASSERT(uc); +} + +void kvm_arch_vm_post_create(struct kvm_vm *vm) +{ + int r; + + TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ), + "Require KVM_GET_TSC_KHZ to provide udelay() 
to guest."); + + vm_create_irqchip(vm); + vm_init_descriptor_tables(vm); + + sync_global_to_guest(vm, host_cpu_is_intel); + sync_global_to_guest(vm, host_cpu_is_amd); + sync_global_to_guest(vm, is_forced_emulation_enabled); + + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; + + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } + + r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency."); + guest_tsc_khz = r; + sync_global_to_guest(vm, guest_tsc_khz); +} + +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + struct kvm_regs regs; + + vcpu_regs_get(vcpu, ®s); + regs.rip = (unsigned long) guest_code; + vcpu_regs_set(vcpu, ®s); +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_mp_state mp_state; + struct kvm_regs regs; + vm_vaddr_t stack_vaddr; + struct kvm_vcpu *vcpu; + + stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); + + stack_vaddr += DEFAULT_STACK_PGS * getpagesize(); + + /* + * Align stack to match calling sequence requirements in section "The + * Stack Frame" of the System V ABI AMD64 Architecture Processor + * Supplement, which requires the value (%rsp + 8) to be a multiple of + * 16 when control is transferred to the function entry point. + * + * If this code is ever used to launch a vCPU with 32-bit entry point it + * may need to subtract 4 bytes instead of 8 bytes. + */ + TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE), + "__vm_vaddr_alloc() did not provide a page-aligned address"); + stack_vaddr -= 8; + + vcpu = __vm_vcpu_add(vm, vcpu_id); + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_init_sregs(vm, vcpu); + vcpu_init_xcrs(vm, vcpu); + + /* Setup guest general purpose registers */ + vcpu_regs_get(vcpu, ®s); + regs.rflags = regs.rflags | 0x2; + regs.rsp = stack_vaddr; + vcpu_regs_set(vcpu, ®s); + + /* Setup the MP state */ + mp_state.mp_state = 0; + vcpu_mp_state_set(vcpu, &mp_state); + + /* + * Refresh CPUID after setting SREGS and XCR0, so that KVM's "runtime" + * updates to guest CPUID, e.g. for OSXSAVE and XSAVE state size, are + * reflected into selftests' vCPU CPUID cache, i.e. so that the cache + * is consistent with vCPU state. + */ + vcpu_get_cpuid(vcpu); + return vcpu; +} + +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); + + return vcpu; +} + +void vcpu_arch_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->cpuid) + free(vcpu->cpuid); +} + +/* Do not use kvm_supported_cpuid directly except for validity checks. 
*/ +static void *kvm_supported_cpuid; + +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +{ + int kvm_fd; + + if (kvm_supported_cpuid) + return kvm_supported_cpuid; + + kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); + + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, + (struct kvm_cpuid2 *)kvm_supported_cpuid); + + close(kvm_fd); + return kvm_supported_cpuid; +} + +static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) +{ + const struct kvm_cpuid_entry2 *entry; + int i; + + for (i = 0; i < cpuid->nent; i++) { + entry = &cpuid->entries[i]; + + /* + * The output registers in kvm_cpuid_entry2 are in alphabetical + * order, but kvm_x86_cpu_feature matches that mess, so yay + * pointer shenanigans! + */ + if (entry->function == function && entry->index == index) + return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo; + } + + return 0; +} + +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature) +{ + return __kvm_cpu_has(cpuid, feature.function, feature.index, + feature.reg, feature.bit, feature.bit); +} + +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property) +{ + return __kvm_cpu_has(cpuid, property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + +uint64_t kvm_get_feature_msr(uint64_t msr_index) +{ + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r, kvm_fd; + + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + kvm_fd = open_kvm_dev_path_or_exit(); + + r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); + + close(kvm_fd); + return buffer.entry.data; +} + +void __vm_xsave_require_permission(uint64_t xfeature, const char *name) +{ + int kvm_fd; + u64 bitmask; + long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask, + }; + + TEST_ASSERT(!kvm_supported_cpuid, + "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM"); + + TEST_ASSERT(is_power_of_2(xfeature), + "Dynamic XFeatures must be enabled one at a time"); + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); + + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + + __TEST_REQUIRE(bitmask & xfeature, + "Required XSAVE feature '%s' not supported", name); + + TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature))); + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & xfeature, + "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx", + name, xfeature, bitmask); +} + +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) +{ + TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); + + /* Allow overriding the default CPUID. 
*/ + if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) { + free(vcpu->cpuid); + vcpu->cpuid = NULL; + } + + if (!vcpu->cpuid) + vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent); + + memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent)); + vcpu_set_cpuid(vcpu); +} + +void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_property property, + uint32_t value) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index); + + (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit); + (&entry->eax)[property.reg] |= value << property.lo_bit; + + vcpu_set_cpuid(vcpu); + + /* Sanity check that @value doesn't exceed the bounds in any way. */ + TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value); +} + +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function); + + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + vcpu_set_cpuid(vcpu); +} + +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set) +{ + struct kvm_cpuid_entry2 *entry; + u32 *reg; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + reg = (&entry->eax) + feature.reg; + + if (set) + *reg |= BIT(feature.bit); + else + *reg &= ~BIT(feature.bit); + + vcpu_set_cpuid(vcpu); +} + +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) +{ + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + + vcpu_msrs_get(vcpu, &buffer.header); + + return buffer.entry.data; +} + +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value) +{ + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + + return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header); +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
+{ + va_list ap; + struct kvm_regs regs; + + TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" + " num: %u", + num); + + va_start(ap, num); + vcpu_regs_get(vcpu, ®s); + + if (num >= 1) + regs.rdi = va_arg(ap, uint64_t); + + if (num >= 2) + regs.rsi = va_arg(ap, uint64_t); + + if (num >= 3) + regs.rdx = va_arg(ap, uint64_t); + + if (num >= 4) + regs.rcx = va_arg(ap, uint64_t); + + if (num >= 5) + regs.r8 = va_arg(ap, uint64_t); + + if (num >= 6) + regs.r9 = va_arg(ap, uint64_t); + + vcpu_regs_set(vcpu, ®s); + va_end(ap); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + + fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id); + + fprintf(stream, "%*sregs:\n", indent + 2, ""); + vcpu_regs_get(vcpu, ®s); + regs_dump(stream, ®s, indent + 4); + + fprintf(stream, "%*ssregs:\n", indent + 2, ""); + vcpu_sregs_get(vcpu, &sregs); + sregs_dump(stream, &sregs, indent + 4); +} + +static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs) +{ + struct kvm_msr_list *list; + struct kvm_msr_list nmsrs; + int kvm_fd, r; + + kvm_fd = open_kvm_dev_path_or_exit(); + + nmsrs.nmsrs = 0; + if (!feature_msrs) + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + else + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs); + + TEST_ASSERT(r == -1 && errno == E2BIG, + "Expected -E2BIG, got rc: %i errno: %i (%s)", + r, errno, strerror(errno)); + + list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0])); + TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); + list->nmsrs = nmsrs.nmsrs; + + if (!feature_msrs) + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + else + kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + close(kvm_fd); + + TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, + "Number of MSRs in list changed, was %d, now %d", + nmsrs.nmsrs, list->nmsrs); + return list; +} + +const struct kvm_msr_list *kvm_get_msr_index_list(void) +{ + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(false); + return list; +} + + +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void) +{ + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(true); + return list; +} + +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) +{ + const struct kvm_msr_list *list = kvm_get_msr_index_list(); + int i; + + for (i = 0; i < list->nmsrs; ++i) { + if (list->indices[i] == msr_index) + return true; + } + + return false; +} + +static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu, + struct kvm_x86_state *state) +{ + int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2); + + if (size) { + state->xsave = malloc(size); + vcpu_xsave2_get(vcpu, state->xsave); + } else { + state->xsave = malloc(sizeof(struct kvm_xsave)); + vcpu_xsave_get(vcpu, state->xsave); + } +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) +{ + const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); + struct kvm_x86_state *state; + int i; + + static int nested_size = -1; + + if (nested_size == -1) { + nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); + TEST_ASSERT(nested_size <= sizeof(state->nested_), + "Nested state size too big, %i > %zi", + nested_size, sizeof(state->nested_)); + } + + /* + * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees + * guest state is consistent only after userspace re-enters the + * kernel with KVM_RUN. Complete IO prior to migrating state + * to a new VM. 
+ */ + vcpu_run_complete_io(vcpu); + + state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); + TEST_ASSERT(state, "-ENOMEM when allocating kvm state"); + + vcpu_events_get(vcpu, &state->events); + vcpu_mp_state_get(vcpu, &state->mp_state); + vcpu_regs_get(vcpu, &state->regs); + vcpu_save_xsave_state(vcpu, state); + + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_get(vcpu, &state->xcrs); + + vcpu_sregs_get(vcpu, &state->sregs); + + if (nested_size) { + state->nested.size = sizeof(state->nested_); + + vcpu_nested_state_get(vcpu, &state->nested); + TEST_ASSERT(state->nested.size <= nested_size, + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else { + state->nested.size = 0; + } + + state->msrs.nmsrs = msr_list->nmsrs; + for (i = 0; i < msr_list->nmsrs; i++) + state->msrs.entries[i].index = msr_list->indices[i]; + vcpu_msrs_get(vcpu, &state->msrs); + + vcpu_debugregs_get(vcpu, &state->debugregs); + + return state; +} + +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) +{ + vcpu_sregs_set(vcpu, &state->sregs); + vcpu_msrs_set(vcpu, &state->msrs); + + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_set(vcpu, &state->xcrs); + + vcpu_xsave_set(vcpu, state->xsave); + vcpu_events_set(vcpu, &state->events); + vcpu_mp_state_set(vcpu, &state->mp_state); + vcpu_debugregs_set(vcpu, &state->debugregs); + vcpu_regs_set(vcpu, &state->regs); + + if (state->nested.size) + vcpu_nested_state_set(vcpu, &state->nested); +} + +void kvm_x86_state_cleanup(struct kvm_x86_state *state) +{ + free(state->xsave); + free(state); +} + +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) +{ + if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) { + *pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32; + *va_bits = 32; + } else { + *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR); + } +} + +void kvm_init_vm_address_properties(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); + vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; + } +} + +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) + return &cpuid->entries[i]; + } + + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); + + return NULL; +} + +#define X86_HYPERCALL(inputs...) 
\ +({ \ + uint64_t r; \ + \ + asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t" \ + "jnz 1f\n\t" \ + "vmcall\n\t" \ + "jmp 2f\n\t" \ + "1: vmmcall\n\t" \ + "2:" \ + : "=a"(r) \ + : [use_vmmcall] "r" (host_cpu_is_amd), inputs); \ + \ + r; \ +}) + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)); +} + +uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1) +{ + return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1)); +} + +void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) +{ + GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); +} + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm) +{ + const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ + unsigned long ht_gfn, max_gfn, max_pfn; + uint8_t maxphyaddr, guest_maxphyaddr; + + /* + * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR + * enumerates the max _mappable_ GPA, which can be less than the raw + * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU + * doesn't support 5-level TDP. + */ + guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR); + guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits; + TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits, + "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR"); + + max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1; + + /* Avoid reserved HyperTransport region on AMD processors. */ + if (!host_cpu_is_amd) + return max_gfn; + + /* On parts with <40 physical address bits, the area is fully hidden */ + if (vm->pa_bits < 40) + return max_gfn; + + /* Before family 17h, the HyperTransport area is just below 1T. */ + ht_gfn = (1 << 28) - num_ht_pages; + if (this_cpu_family() < 0x17) + goto done; + + /* + * Otherwise it's at the top of the physical address space, possibly + * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use + * the old conservative value if MAXPHYADDR is not enumerated. + */ + if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) + goto done; + + maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1; + + if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION)) + max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION); + + ht_gfn = max_pfn - num_ht_pages; +done: + return min(max_gfn, ht_gfn - 1); +} + +/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ +bool vm_is_unrestricted_guest(struct kvm_vm *vm) +{ + /* Ensure that a KVM vendor-specific module is loaded. 
*/ + if (vm == NULL) + close(open_kvm_dev_path_or_exit()); + + return get_kvm_intel_param_bool("unrestricted_guest"); +} + +void kvm_selftest_arch_init(void) +{ + host_cpu_is_intel = this_cpu_is_intel(); + host_cpu_is_amd = this_cpu_is_amd(); + is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); +} + +bool sys_clocksource_is_based_on_tsc(void) +{ + char *clk_name = sys_get_cur_clocksource(); + bool ret = !strcmp(clk_name, "tsc\n") || + !strcmp(clk_name, "hyperv_clocksource_tsc_page\n"); + + free(clk_name); + + return ret; +} diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c new file mode 100644 index 000000000000..e9535ee20b7f --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/sev.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include "sev.h" + +/* + * sparsebit_next_clear() can return 0 if [x, 2**64-1] are all set, and the + * -1 would then cause an underflow back to 2**64 - 1. This is expected and + * correct. + * + * If the last range in the sparsebit is [x, y] and we try to iterate, + * sparsebit_next_set() will return 0, and sparsebit_next_clear() will try + * and find the first range, but that's correct because the condition + * expression would cause us to quit the loop. + */ +static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region) +{ + const struct sparsebit *protected_phy_pages = region->protected_phy_pages; + const vm_paddr_t gpa_base = region->region.guest_phys_addr; + const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift; + sparsebit_idx_t i, j; + + if (!sparsebit_any_set(protected_phy_pages)) + return; + + sev_register_encrypted_memory(vm, region); + + sparsebit_for_each_set_range(protected_phy_pages, i, j) { + const uint64_t size = (j - i + 1) * vm->page_size; + const uint64_t offset = (i - lowest_page_in_region) * vm->page_size; + + sev_launch_update_data(vm, gpa_base + offset, size); + } +} + +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) +{ + struct kvm_sev_launch_start launch_start = { + .policy = policy, + }; + struct userspace_mem_region *region; + struct kvm_sev_guest_status status; + int ctr; + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_START, &launch_start); + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + + TEST_ASSERT_EQ(status.policy, policy); + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE); + + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) + encrypt_region(vm, region); + + if (policy & SEV_POLICY_ES) + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + + vm->arch.is_pt_protected = true; +} + +void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement) +{ + struct kvm_sev_launch_measure launch_measure; + struct kvm_sev_guest_status guest_status; + + launch_measure.len = 256; + launch_measure.uaddr = 
(__u64)measurement; + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_MEASURE, &launch_measure); + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &guest_status); + TEST_ASSERT_EQ(guest_status.state, SEV_GUEST_STATE_LAUNCH_SECRET); +} + +void sev_vm_launch_finish(struct kvm_vm *vm) +{ + struct kvm_sev_guest_status status; + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + TEST_ASSERT(status.state == SEV_GUEST_STATE_LAUNCH_UPDATE || + status.state == SEV_GUEST_STATE_LAUNCH_SECRET, + "Unexpected guest state: %d", status.state); + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_FINISH, NULL); + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); +} + +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, + struct kvm_vcpu **cpu) +{ + struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + struct kvm_vm *vm; + struct kvm_vcpu *cpus[1]; + + vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); + *cpu = cpus[0]; + + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ + sev_vm_launch(vm, policy); + + if (!measurement) + measurement = alloca(256); + + sev_vm_launch_measure(vm, measurement); + + sev_vm_launch_finish(vm); +} diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c new file mode 100644 index 000000000000..d239c2097391 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Helpers used for nested SVM testing + * Largely inspired from KVM unit test svm.c + * + * Copyright (C) 2020, Red Hat, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +#define SEV_DEV_PATH "/dev/sev" + +struct gpr64_regs guest_regs; +u64 rflags; + +/* Allocate memory regions for nested SVM tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_svm_gva - The guest virtual address for the struct svm_test_data. + * + * Return: + * Pointer to structure with the addresses of the SVM areas. 
+ */ +struct svm_test_data * +vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) +{ + vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); + struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); + + svm->vmcb = (void *)vm_vaddr_alloc_page(vm); + svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); + svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); + + svm->save_area = (void *)vm_vaddr_alloc_page(vm); + svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); + svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + + svm->msr = (void *)vm_vaddr_alloc_page(vm); + svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); + svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); + memset(svm->msr_hva, 0, getpagesize()); + + *p_svm_gva = svm_gva; + return svm; +} + +static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, + u64 base, u32 limit, u32 attr) +{ + seg->selector = selector; + seg->attrib = attr; + seg->limit = limit; + seg->base = base; +} + +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) +{ + struct vmcb *vmcb = svm->vmcb; + uint64_t vmcb_gpa = svm->vmcb_gpa; + struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *ctrl = &vmcb->control; + u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK; + u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK; + uint64_t efer; + + efer = rdmsr(MSR_EFER); + wrmsr(MSR_EFER, efer | EFER_SVME); + wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); + + memset(vmcb, 0, sizeof(*vmcb)); + asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); + vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); + vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0); + vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0); + + ctrl->asid = 1; + save->cpl = 0; + save->efer = rdmsr(MSR_EFER); + asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory"); + asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory"); + asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory"); + asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory"); + asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory"); + asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory"); + save->g_pat = rdmsr(MSR_IA32_CR_PAT); + save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL); + ctrl->msrpm_base_pa = svm->msr_gpa; + + vmcb->save.rip = (u64)guest_rip; + vmcb->save.rsp = (u64)guest_rsp; + guest_regs.rdi = (u64)svm; +} + +/* + * save/restore 64-bit general registers except rax, rip, rsp + * which are directly handed through the VMCB guest processor state + */ +#define SAVE_GPR_C \ + "xchg %%rbx, guest_regs+0x20\n\t" \ + "xchg %%rcx, guest_regs+0x10\n\t" \ + "xchg %%rdx, guest_regs+0x18\n\t" \ + "xchg %%rbp, guest_regs+0x30\n\t" \ + "xchg %%rsi, guest_regs+0x38\n\t" \ + "xchg %%rdi, guest_regs+0x40\n\t" \ + "xchg %%r8, guest_regs+0x48\n\t" \ + "xchg %%r9, guest_regs+0x50\n\t" \ + "xchg %%r10, guest_regs+0x58\n\t" \ + "xchg %%r11, guest_regs+0x60\n\t" \ + "xchg %%r12, guest_regs+0x68\n\t" \ + "xchg %%r13, guest_regs+0x70\n\t" \ + "xchg %%r14, guest_regs+0x78\n\t" \ + "xchg %%r15, guest_regs+0x80\n\t" + +#define 
LOAD_GPR_C SAVE_GPR_C + +/* + * selftests do not use interrupts so we dropped clgi/sti/cli/stgi + * for now. registers involved in LOAD/SAVE_GPR_C are eventually + * unmodified so they do not need to be in the clobber list. + */ +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) +{ + asm volatile ( + "vmload %[vmcb_gpa]\n\t" + "mov rflags, %%r15\n\t" // rflags + "mov %%r15, 0x170(%[vmcb])\n\t" + "mov guest_regs, %%r15\n\t" // rax + "mov %%r15, 0x1f8(%[vmcb])\n\t" + LOAD_GPR_C + "vmrun %[vmcb_gpa]\n\t" + SAVE_GPR_C + "mov 0x170(%[vmcb]), %%r15\n\t" // rflags + "mov %%r15, rflags\n\t" + "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax + "mov %%r15, guest_regs\n\t" + "vmsave %[vmcb_gpa]\n\t" + : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) + : "r15", "memory"); +} + +/* + * Open SEV_DEV_PATH if available, otherwise exit the entire program. + * + * Return: + * The opened file descriptor of /dev/sev. + */ +int open_sev_dev_path_or_exit(void) +{ + return open_path_or_exit(SEV_DEV_PATH, 0); +} diff --git a/tools/testing/selftests/kvm/lib/x86/ucall.c b/tools/testing/selftests/kvm/lib/x86/ucall.c new file mode 100644 index 000000000000..1265cecc7dd1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/ucall.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include "kvm_util.h" + +#define UCALL_PIO_PORT ((uint16_t)0x1000) + +void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + /* + * FIXME: Revert this hack (the entire commit that added it) once nVMX + * preserves L2 GPRs across a nested VM-Exit. If a ucall from L2, e.g. + * to do a GUEST_SYNC(), lands the vCPU in L1, any and all GPRs can be + * clobbered by L1. Save and restore non-volatile GPRs (clobbering RBP + * in particular is problematic) along with RDX and RDI (which are + * inputs), and clobber volatile GPRs. *sigh* + */ +#define HORRIFIC_L2_UCALL_CLOBBER_HACK \ + "rcx", "rsi", "r8", "r9", "r10", "r11" + + asm volatile("push %%rbp\n\t" + "push %%r15\n\t" + "push %%r14\n\t" + "push %%r13\n\t" + "push %%r12\n\t" + "push %%rbx\n\t" + "push %%rdx\n\t" + "push %%rdi\n\t" + "in %[port], %%al\n\t" + "pop %%rdi\n\t" + "pop %%rdx\n\t" + "pop %%rbx\n\t" + "pop %%r12\n\t" + "pop %%r13\n\t" + "pop %%r14\n\t" + "pop %%r15\n\t" + "pop %%rbp\n\t" + : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory", + HORRIFIC_L2_UCALL_CLOBBER_HACK); +} + +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { + struct kvm_regs regs; + + vcpu_regs_get(vcpu, ®s); + return (void *)regs.rdi; + } + return NULL; +} diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c new file mode 100644 index 000000000000..d4d1208dd023 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018, Google LLC. 
+ */ + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define PAGE_SHIFT_4K 12 + +#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 + +bool enable_evmcs; + +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + +struct eptPageTableEntry { + uint64_t readable:1; + uint64_t writable:1; + uint64_t executable:1; + uint64_t memory_type:3; + uint64_t ignore_pat:1; + uint64_t page_size:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t ignored_11_10:2; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t suppress_ve:1; +}; + +struct eptPageTablePointer { + uint64_t memory_type:3; + uint64_t page_walk_length:3; + uint64_t ad_enabled:1; + uint64_t reserved_11_07:5; + uint64_t address:40; + uint64_t reserved_63_52:12; +}; +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) +{ + uint16_t evmcs_ver; + + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + (unsigned long)&evmcs_ver); + + /* KVM should return supported EVMCS version range */ + TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && + (evmcs_ver & 0xff) > 0, + "Incorrect EVMCS version range: %x:%x", + evmcs_ver & 0xff, evmcs_ver >> 8); + + return evmcs_ver; +} + +/* Allocate memory regions for nested VMX tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_vmx_gva - The guest virtual address for the struct vmx_pages. + * + * Return: + * Pointer to structure with the addresses of the VMX areas. + */ +struct vmx_pages * +vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) +{ + vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); + struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); + + /* Setup of a region of guest memory for the vmxon region. */ + vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); + vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); + vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); + + /* Setup of a region of guest memory for a vmcs. */ + vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); + vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); + vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); + + /* Setup of a region of guest memory for the MSR bitmap. */ + vmx->msr = (void *)vm_vaddr_alloc_page(vm); + vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); + vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); + memset(vmx->msr_hva, 0, getpagesize()); + + /* Setup of a region of guest memory for the shadow VMCS. */ + vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); + vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); + vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); + + /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ + vmx->vmread = (void *)vm_vaddr_alloc_page(vm); + vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); + vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); + memset(vmx->vmread_hva, 0, getpagesize()); + + vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); + vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); + vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); + memset(vmx->vmwrite_hva, 0, getpagesize()); + + *p_vmx_gva = vmx_gva; + return vmx; +} + +bool prepare_for_vmx_operation(struct vmx_pages *vmx) +{ + uint64_t feature_control; + uint64_t required; + unsigned long cr0; + unsigned long cr4; + + /* + * Ensure bits in CR0 and CR4 are valid in VMX operation: + * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx. 
+ * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx. + */ + __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory"); + cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1); + cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0); + __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory"); + + __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory"); + cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1); + cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0); + /* Enable VMX operation */ + cr4 |= X86_CR4_VMXE; + __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory"); + + /* + * Configure IA32_FEATURE_CONTROL MSR to allow VMXON: + * Bit 0: Lock bit. If clear, VMXON causes a #GP. + * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON + * outside of SMX causes a #GP. + */ + required = FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + required |= FEAT_CTL_LOCKED; + feature_control = rdmsr(MSR_IA32_FEAT_CTL); + if ((feature_control & required) != required) + wrmsr(MSR_IA32_FEAT_CTL, feature_control | required); + + /* Enter VMX root operation. */ + *(uint32_t *)(vmx->vmxon) = vmcs_revision(); + if (vmxon(vmx->vmxon_gpa)) + return false; + + return true; +} + +bool load_vmcs(struct vmx_pages *vmx) +{ + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; + + return true; +} + +static bool ept_vpid_cap_supported(uint64_t mask) +{ + return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask; +} + +bool ept_1g_pages_supported(void) +{ + return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES); +} + +/* + * Initialize the control fields to the most basic settings possible. 
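+ *
+ * As a rough sketch of what follows: when vmx->eptp_gpa is set, the EPT
+ * pointer is assembled from the eptPageTablePointer bitfield defined above,
+ * i.e.
+ *
+ *   memory_type      (bits 2:0)   = X86_MEMTYPE_WB
+ *   page_walk_length (bits 5:3)   = 3, i.e. a 4-level walk
+ *   ad_enabled       (bit 6)      = whether VMX_EPT_VPID_CAP_AD_BITS is set
+ *   address          (bits 51:12) = vmx->eptp_gpa >> PAGE_SHIFT_4K
+ *
+ * and the packed value is written via vmwrite(EPT_POINTER, ...).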
+ */ +static inline void init_vmcs_control_fields(struct vmx_pages *vmx) +{ + uint32_t sec_exec_ctl = 0; + + vmwrite(VIRTUAL_PROCESSOR_ID, 0); + vmwrite(POSTED_INTR_NV, 0); + + vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); + + if (vmx->eptp_gpa) { + uint64_t ept_paddr; + struct eptPageTablePointer eptp = { + .memory_type = X86_MEMTYPE_WB, + .page_walk_length = 3, /* + 1 */ + .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), + .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, + }; + + memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); + vmwrite(EPT_POINTER, ept_paddr); + sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; + } + + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) + vmwrite(CPU_BASED_VM_EXEC_CONTROL, + rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + else { + vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); + GUEST_ASSERT(!sec_exec_ctl); + } + + vmwrite(EXCEPTION_BITMAP, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ + vmwrite(CR3_TARGET_COUNT, 0); + vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) | + VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */ + vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) | + VM_ENTRY_IA32E_MODE); /* 64-bit guest */ + vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); + vmwrite(TPR_THRESHOLD, 0); + + vmwrite(CR0_GUEST_HOST_MASK, 0); + vmwrite(CR4_GUEST_HOST_MASK, 0); + vmwrite(CR0_READ_SHADOW, get_cr0()); + vmwrite(CR4_READ_SHADOW, get_cr4()); + + vmwrite(MSR_BITMAP, vmx->msr_gpa); + vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); + vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); +} + +/* + * Initialize the host state fields based on the current host state, with + * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch + * or vmresume. + */ +static inline void init_vmcs_host_state(void) +{ + uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS); + + vmwrite(HOST_ES_SELECTOR, get_es()); + vmwrite(HOST_CS_SELECTOR, get_cs()); + vmwrite(HOST_SS_SELECTOR, get_ss()); + vmwrite(HOST_DS_SELECTOR, get_ds()); + vmwrite(HOST_FS_SELECTOR, get_fs()); + vmwrite(HOST_GS_SELECTOR, get_gs()); + vmwrite(HOST_TR_SELECTOR, get_tr()); + + if (exit_controls & VM_EXIT_LOAD_IA32_PAT) + vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT)); + if (exit_controls & VM_EXIT_LOAD_IA32_EFER) + vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER)); + if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmwrite(HOST_IA32_PERF_GLOBAL_CTRL, + rdmsr(MSR_CORE_PERF_GLOBAL_CTRL)); + + vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS)); + + vmwrite(HOST_CR0, get_cr0()); + vmwrite(HOST_CR3, get_cr3()); + vmwrite(HOST_CR4, get_cr4()); + vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE)); + vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE)); + vmwrite(HOST_TR_BASE, + get_desc64_base((struct desc64 *)(get_gdt().address + get_tr()))); + vmwrite(HOST_GDTR_BASE, get_gdt().address); + vmwrite(HOST_IDTR_BASE, get_idt().address); + vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP)); + vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP)); +} + +/* + * Initialize the guest state fields essentially as a clone of + * the host state fields. Some host state fields have fixed + * values, and we set the corresponding guest state fields accordingly. 
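+ *
+ * For example, the selector, base and control-register fields below are
+ * copied from the host via vmreadz(HOST_*), while architecturally fixed
+ * values such as RFLAGS (2, reserved bit 1 set), DR7 (0x400) and the TR
+ * limit (0x67) are hard-coded rather than cloned.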
+ */ +static inline void init_vmcs_guest_state(void *rip, void *rsp) +{ + vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR)); + vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR)); + vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR)); + vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR)); + vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR)); + vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR)); + vmwrite(GUEST_LDTR_SELECTOR, 0); + vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR)); + vmwrite(GUEST_INTR_STATUS, 0); + vmwrite(GUEST_PML_INDEX, 0); + + vmwrite(VMCS_LINK_POINTER, -1ll); + vmwrite(GUEST_IA32_DEBUGCTL, 0); + vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT)); + vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER)); + vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL, + vmreadz(HOST_IA32_PERF_GLOBAL_CTRL)); + + vmwrite(GUEST_ES_LIMIT, -1); + vmwrite(GUEST_CS_LIMIT, -1); + vmwrite(GUEST_SS_LIMIT, -1); + vmwrite(GUEST_DS_LIMIT, -1); + vmwrite(GUEST_FS_LIMIT, -1); + vmwrite(GUEST_GS_LIMIT, -1); + vmwrite(GUEST_LDTR_LIMIT, -1); + vmwrite(GUEST_TR_LIMIT, 0x67); + vmwrite(GUEST_GDTR_LIMIT, 0xffff); + vmwrite(GUEST_IDTR_LIMIT, 0xffff); + vmwrite(GUEST_ES_AR_BYTES, + vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_CS_AR_BYTES, 0xa09b); + vmwrite(GUEST_SS_AR_BYTES, 0xc093); + vmwrite(GUEST_DS_AR_BYTES, + vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_FS_AR_BYTES, + vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_GS_AR_BYTES, + vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_LDTR_AR_BYTES, 0x10000); + vmwrite(GUEST_TR_AR_BYTES, 0x8b); + vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + vmwrite(GUEST_ACTIVITY_STATE, 0); + vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS)); + vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0); + + vmwrite(GUEST_CR0, vmreadz(HOST_CR0)); + vmwrite(GUEST_CR3, vmreadz(HOST_CR3)); + vmwrite(GUEST_CR4, vmreadz(HOST_CR4)); + vmwrite(GUEST_ES_BASE, 0); + vmwrite(GUEST_CS_BASE, 0); + vmwrite(GUEST_SS_BASE, 0); + vmwrite(GUEST_DS_BASE, 0); + vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE)); + vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE)); + vmwrite(GUEST_LDTR_BASE, 0); + vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE)); + vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE)); + vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE)); + vmwrite(GUEST_DR7, 0x400); + vmwrite(GUEST_RSP, (uint64_t)rsp); + vmwrite(GUEST_RIP, (uint64_t)rip); + vmwrite(GUEST_RFLAGS, 2); + vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP)); + vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); +} + +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) +{ + init_vmcs_control_fields(vmx); + init_vmcs_host_state(); + init_vmcs_guest_state(guest_rip, guest_rsp); +} + +static void nested_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) +{ + if (!pte->readable) { + pte->writable = true; + pte->readable = true; + pte->executable = true; + pte->page_size = (current_level == target_level); + if (pte->page_size) + pte->address = paddr >> vm->page_shift; + else + pte->address = vm_alloc_page_table(vm) >> vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. 
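+ * Put differently, an existing mapping is never silently replaced:
+ * requesting a hugepage where a page table already exists (or vice
+ * versa) fails the test instead.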
+ */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", + current_level, nested_paddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, nested_paddr: 0x%lx", + current_level, nested_paddr); + } +} + + +void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) +{ + const uint64_t page_size = PG_LEVEL_SIZE(target_level); + struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + uint16_t index; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((nested_paddr >> 48) == 0, + "Nested physical address 0x%lx requires 5-level paging", + nested_paddr); + TEST_ASSERT((nested_paddr % page_size) == 0, + "Nested physical address not on page boundary,\n" + " nested_paddr: 0x%lx page_size: 0x%lx", + nested_paddr, page_size); + TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT((paddr % page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx page_size: 0x%lx", + paddr, page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { + index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + pte = &pt[index]; + + nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); + + if (pte->page_size) + break; + + pt = addr_gpa2hva(vm, pte->address * vm->page_size); + } + + /* + * For now mark these as accessed and dirty because the only + * testcase we have needs that. Can be reconsidered later. + */ + pte->accessed = true; + pte->dirty = true; + +} + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) +{ + __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); +} + +/* + * Map a range of EPT guest physical addresses to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * nested_paddr - Nested guest physical address to map + * paddr - VM Physical Address + * size - The size of the range to map + * level - The level at which to map the range + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a nested guest translation for the + * page range starting at nested_paddr to the page range starting at paddr. + */ +void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + int level) +{ + size_t page_size = PG_LEVEL_SIZE(level); + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + __nested_pg_map(vmx, vm, nested_paddr, paddr, level); + nested_paddr += page_size; + paddr += page_size; + } +} + +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) +{ + __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. 
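+ *
+ * A minimal usage sketch (hypothetical caller; assumes prepare_eptp() has
+ * already allocated vmx->eptp for this VM):
+ *
+ *   prepare_eptp(vmx, vm, 0);
+ *   nested_map_memslot(vmx, vm, 0);
+ *
+ * after which L2 accesses to memslot 0 are identity-mapped through EPT.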
+ */ +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot) +{ + sparsebit_idx_t i, last; + struct userspace_mem_region *region = + memslot2region(vm, memslot); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + nested_map(vmx, vm, + (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, + 1 << vm->page_shift); + } +} + +/* Identity map a region with 1GiB Pages. */ +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size) +{ + __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); +} + +bool kvm_cpu_has_ept(void) +{ + uint64_t ctrl; + + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; + if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + return false; + + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; + return ctrl & SECONDARY_EXEC_ENABLE_EPT; +} + +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + + vmx->eptp = (void *)vm_vaddr_alloc_page(vm); + vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); + vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); +} + +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); + vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); + vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c deleted file mode 100644 index 89153a333e83..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/apic.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Google LLC. - */ - -#include "apic.h" - -void apic_disable(void) -{ - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) & - ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); -} - -void xapic_enable(void) -{ - uint64_t val = rdmsr(MSR_IA32_APICBASE); - - /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ - if (val & MSR_IA32_APICBASE_EXTD) { - apic_disable(); - wrmsr(MSR_IA32_APICBASE, - rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); - } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { - wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); - } - - /* - * Per SDM: reset value of spurious interrupt vector register has the - * APIC software enabled bit=0. It must be enabled in addition to the - * enable bit in the MSR. 
- */ - val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; - xapic_write_reg(APIC_SPIV, val); -} - -void x2apic_enable(void) -{ - wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | - MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); - x2apic_write_reg(APIC_SPIV, - x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S deleted file mode 100644 index 7629819734af..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/handlers.S +++ /dev/null @@ -1,81 +0,0 @@ -handle_exception: - push %r15 - push %r14 - push %r13 - push %r12 - push %r11 - push %r10 - push %r9 - push %r8 - - push %rdi - push %rsi - push %rbp - push %rbx - push %rdx - push %rcx - push %rax - mov %rsp, %rdi - - call route_exception - - pop %rax - pop %rcx - pop %rdx - pop %rbx - pop %rbp - pop %rsi - pop %rdi - pop %r8 - pop %r9 - pop %r10 - pop %r11 - pop %r12 - pop %r13 - pop %r14 - pop %r15 - - /* Discard vector and error code. */ - add $16, %rsp - iretq - -/* - * Build the handle_exception wrappers which push the vector/error code on the - * stack and an array of pointers to those wrappers. - */ -.pushsection .rodata -.globl idt_handlers -idt_handlers: -.popsection - -.macro HANDLERS has_error from to - vector = \from - .rept \to - \from + 1 - .align 8 - - /* Fetch current address and append it to idt_handlers. */ -666 : -.pushsection .rodata - .quad 666b -.popsection - - .if ! \has_error - pushq $0 - .endif - pushq $vector - jmp handle_exception - vector = vector + 1 - .endr -.endm - -.global idt_handler_code -idt_handler_code: - HANDLERS has_error=0 from=0 to=7 - HANDLERS has_error=1 from=8 to=8 - HANDLERS has_error=0 from=9 to=9 - HANDLERS has_error=1 from=10 to=14 - HANDLERS has_error=0 from=15 to=16 - HANDLERS has_error=1 from=17 to=17 - HANDLERS has_error=0 from=18 to=255 - -.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c deleted file mode 100644 index 15bc8cd583aa..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Hyper-V specific functions. - * - * Copyright (C) 2021, Red Hat Inc. 
- */ -#include -#include "processor.h" -#include "hyperv.h" - -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) -{ - static struct kvm_cpuid2 *cpuid; - int kvm_fd; - - if (cpuid) - return cpuid; - - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - kvm_fd = open_kvm_dev_path_or_exit(); - - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - close(kvm_fd); - return cpuid; -} - -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) -{ - static struct kvm_cpuid2 *cpuid_full; - const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; - int i, nent = 0; - - if (!cpuid_full) { - cpuid_sys = kvm_get_supported_cpuid(); - cpuid_hv = kvm_get_supported_hv_cpuid(); - - cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); - if (!cpuid_full) { - perror("malloc"); - abort(); - } - - /* Need to skip KVM CPUID leaves 0x400000xx */ - for (i = 0; i < cpuid_sys->nent; i++) { - if (cpuid_sys->entries[i].function >= 0x40000000 && - cpuid_sys->entries[i].function < 0x40000100) - continue; - cpuid_full->entries[nent] = cpuid_sys->entries[i]; - nent++; - } - - memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, - cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); - cpuid_full->nent = nent + cpuid_hv->nent; - } - - vcpu_init_cpuid(vcpu, cpuid_full); -} - -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - - vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - return cpuid; -} - -bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) -{ - if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) - return false; - - return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); -} - -struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, - vm_vaddr_t *p_hv_pages_gva) -{ - vm_vaddr_t hv_pages_gva = vm_vaddr_alloc_page(vm); - struct hyperv_test_pages *hv = addr_gva2hva(vm, hv_pages_gva); - - /* Setup of a region of guest memory for the VP Assist page. */ - hv->vp_assist = (void *)vm_vaddr_alloc_page(vm); - hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist); - hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist); - - /* Setup of a region of guest memory for the partition assist page. */ - hv->partition_assist = (void *)vm_vaddr_alloc_page(vm); - hv->partition_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->partition_assist); - hv->partition_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->partition_assist); - - /* Setup of a region of guest memory for the enlightened VMCS. */ - hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); - hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs); - hv->enlightened_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)hv->enlightened_vmcs); - - *p_hv_pages_gva = hv_pages_gva; - return hv; -} - -int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) -{ - uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | - HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; - - wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); - - current_vp_assist = vp_assist; - - return 0; -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/memstress.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c deleted file mode 100644 index d61e623afc8c..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/memstress.c +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * x86_64-specific extensions to memstress.c. - * - * Copyright (C) 2022, Google, Inc. 
- */ -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "memstress.h" -#include "processor.h" -#include "vmx.h" - -void memstress_l2_guest_code(uint64_t vcpu_id) -{ - memstress_guest_code(vcpu_id); - vmcall(); -} - -extern char memstress_l2_guest_entry[]; -__asm__( -"memstress_l2_guest_entry:" -" mov (%rsp), %rdi;" -" call memstress_l2_guest_code;" -" ud2;" -); - -static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - unsigned long *rsp; - - GUEST_ASSERT(vmx->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx)); - GUEST_ASSERT(load_vmcs(vmx)); - GUEST_ASSERT(ept_1g_pages_supported()); - - rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; - *rsp = vcpu_id; - prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_DONE(); -} - -uint64_t memstress_nested_pages(int nr_vcpus) -{ - /* - * 513 page tables is enough to identity-map 256 TiB of L2 with 1G - * pages and 4-level paging, plus a few pages per-vCPU for data - * structures such as the VMCS. - */ - return 513 + 10 * nr_vcpus; -} - -void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) -{ - uint64_t start, end; - - prepare_eptp(vmx, vm, 0); - - /* - * Identity map the first 4G and the test region with 1G pages so that - * KVM can shadow the EPT12 with the maximum huge page size supported - * by the backing source. - */ - nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); - - start = align_down(memstress_args.gpa, PG_SIZE_1G); - end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - nested_identity_map_1g(vmx, vm, start, end - start); -} - -void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) -{ - struct vmx_pages *vmx, *vmx0 = NULL; - struct kvm_regs regs; - vm_vaddr_t vmx_gva; - int vcpu_id; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); - - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vmx = vcpu_alloc_vmx(vm, &vmx_gva); - - if (vcpu_id == 0) { - memstress_setup_ept(vmx, vm); - vmx0 = vmx; - } else { - /* Share the same EPT table across all vCPUs. */ - vmx->eptp = vmx0->eptp; - vmx->eptp_hva = vmx0->eptp_hva; - vmx->eptp_gpa = vmx0->eptp_gpa; - } - - /* - * Override the vCPU to run memstress_l1_guest_code() which will - * bounce it into L2 before calling memstress_guest_code(). - */ - vcpu_regs_get(vcpus[vcpu_id], ®s); - regs.rip = (unsigned long) memstress_l1_guest_code; - vcpu_regs_set(vcpus[vcpu_id], ®s); - vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); - } -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/pmu.c b/tools/testing/selftests/kvm/lib/x86_64/pmu.c deleted file mode 100644 index f31f0427c17c..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/pmu.c +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2023, Tencent, Inc. 
- */ - -#include - -#include - -#include "kvm_util.h" -#include "pmu.h" - -const uint64_t intel_pmu_arch_events[] = { - INTEL_ARCH_CPU_CYCLES, - INTEL_ARCH_INSTRUCTIONS_RETIRED, - INTEL_ARCH_REFERENCE_CYCLES, - INTEL_ARCH_LLC_REFERENCES, - INTEL_ARCH_LLC_MISSES, - INTEL_ARCH_BRANCHES_RETIRED, - INTEL_ARCH_BRANCHES_MISPREDICTED, - INTEL_ARCH_TOPDOWN_SLOTS, -}; -kvm_static_assert(ARRAY_SIZE(intel_pmu_arch_events) == NR_INTEL_ARCH_EVENTS); - -const uint64_t amd_pmu_zen_events[] = { - AMD_ZEN_CORE_CYCLES, - AMD_ZEN_INSTRUCTIONS_RETIRED, - AMD_ZEN_BRANCHES_RETIRED, - AMD_ZEN_BRANCHES_MISPREDICTED, -}; -kvm_static_assert(ARRAY_SIZE(amd_pmu_zen_events) == NR_AMD_ZEN_EVENTS); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c deleted file mode 100644 index 636b29ba8985..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ /dev/null @@ -1,1295 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * tools/testing/selftests/kvm/lib/x86_64/processor.c - * - * Copyright (C) 2018, Google LLC. - */ - -#include "linux/bitmap.h" -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "sev.h" - -#ifndef NUM_INTERRUPTS -#define NUM_INTERRUPTS 256 -#endif - -#define KERNEL_CS 0x8 -#define KERNEL_DS 0x10 -#define KERNEL_TSS 0x18 - -vm_vaddr_t exception_handlers; -bool host_cpu_is_amd; -bool host_cpu_is_intel; -bool is_forced_emulation_enabled; -uint64_t guest_tsc_khz; - -static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) -{ - fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " - "rcx: 0x%.16llx rdx: 0x%.16llx\n", - indent, "", - regs->rax, regs->rbx, regs->rcx, regs->rdx); - fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx " - "rsp: 0x%.16llx rbp: 0x%.16llx\n", - indent, "", - regs->rsi, regs->rdi, regs->rsp, regs->rbp); - fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx " - "r10: 0x%.16llx r11: 0x%.16llx\n", - indent, "", - regs->r8, regs->r9, regs->r10, regs->r11); - fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx " - "r14: 0x%.16llx r15: 0x%.16llx\n", - indent, "", - regs->r12, regs->r13, regs->r14, regs->r15); - fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n", - indent, "", - regs->rip, regs->rflags); -} - -static void segment_dump(FILE *stream, struct kvm_segment *segment, - uint8_t indent) -{ - fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x " - "selector: 0x%.4x type: 0x%.2x\n", - indent, "", segment->base, segment->limit, - segment->selector, segment->type); - fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x " - "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n", - indent, "", segment->present, segment->dpl, - segment->db, segment->s, segment->l); - fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x " - "unusable: 0x%.2x padding: 0x%.2x\n", - indent, "", segment->g, segment->avl, - segment->unusable, segment->padding); -} - -static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, - uint8_t indent) -{ - fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x " - "padding: 0x%.4x 0x%.4x 0x%.4x\n", - indent, "", dtable->base, dtable->limit, - dtable->padding[0], dtable->padding[1], dtable->padding[2]); -} - -static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) -{ - unsigned int i; - - fprintf(stream, "%*scs:\n", indent, ""); - segment_dump(stream, &sregs->cs, indent + 2); - fprintf(stream, "%*sds:\n", indent, ""); - segment_dump(stream, &sregs->ds, indent + 2); - fprintf(stream, "%*ses:\n", indent, ""); - segment_dump(stream, &sregs->es, indent + 2); - 
fprintf(stream, "%*sfs:\n", indent, ""); - segment_dump(stream, &sregs->fs, indent + 2); - fprintf(stream, "%*sgs:\n", indent, ""); - segment_dump(stream, &sregs->gs, indent + 2); - fprintf(stream, "%*sss:\n", indent, ""); - segment_dump(stream, &sregs->ss, indent + 2); - fprintf(stream, "%*str:\n", indent, ""); - segment_dump(stream, &sregs->tr, indent + 2); - fprintf(stream, "%*sldt:\n", indent, ""); - segment_dump(stream, &sregs->ldt, indent + 2); - - fprintf(stream, "%*sgdt:\n", indent, ""); - dtable_dump(stream, &sregs->gdt, indent + 2); - fprintf(stream, "%*sidt:\n", indent, ""); - dtable_dump(stream, &sregs->idt, indent + 2); - - fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx " - "cr3: 0x%.16llx cr4: 0x%.16llx\n", - indent, "", - sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4); - fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx " - "apic_base: 0x%.16llx\n", - indent, "", - sregs->cr8, sregs->efer, sregs->apic_base); - - fprintf(stream, "%*sinterrupt_bitmap:\n", indent, ""); - for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) { - fprintf(stream, "%*s%.16llx\n", indent + 2, "", - sregs->interrupt_bitmap[i]); - } -} - -bool kvm_is_tdp_enabled(void) -{ - if (host_cpu_is_intel) - return get_kvm_intel_param_bool("ept"); - else - return get_kvm_amd_param_bool("npt"); -} - -void virt_arch_pgd_alloc(struct kvm_vm *vm) -{ - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - /* If needed, create page map l4 table. */ - if (!vm->pgd_created) { - vm->pgd = vm_alloc_page_table(vm); - vm->pgd_created = true; - } -} - -static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, - uint64_t vaddr, int level) -{ - uint64_t pt_gpa = PTE_GET_PA(*parent_pte); - uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); - int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, - "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", - level + 1, vaddr); - - return &page_table[index]; -} - -static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, - uint64_t *parent_pte, - uint64_t vaddr, - uint64_t paddr, - int current_level, - int target_level) -{ - uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); - - paddr = vm_untag_gpa(vm, paddr); - - if (!(*pte & PTE_PRESENT_MASK)) { - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; - if (current_level == target_level) - *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); - else - *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; - } else { - /* - * Entry already present. Assert that the caller doesn't want - * a hugepage at this level, and that there isn't a hugepage at - * this level. 
- */ - TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, vaddr: 0x%lx", - current_level, vaddr); - TEST_ASSERT(!(*pte & PTE_LARGE_MASK), - "Cannot create page table at level: %u, vaddr: 0x%lx", - current_level, vaddr); - } - return pte; -} - -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) -{ - const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; - - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, - "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((vaddr % pg_size) == 0, - "Virtual address not aligned,\n" - "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); - TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), - "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % pg_size) == 0, - "Physical address not aligned,\n" - " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, - "Unexpected bits in paddr: %lx", paddr); - - /* - * Allocate upper level page tables, if not already present. Return - * early if a hugepage was created. - */ - pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); - if (*pml4e & PTE_LARGE_MASK) - return; - - pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); - if (*pdpe & PTE_LARGE_MASK) - return; - - pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); - if (*pde & PTE_LARGE_MASK) - return; - - /* Fill in page table entry. */ - pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); - TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), - "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); - - /* - * Neither SEV nor TDX supports shared page tables, so only the final - * leaf PTE needs manually set the C/S-bit. 
- */ - if (vm_is_gpa_protected(vm, paddr)) - *pte |= vm->arch.c_bit; - else - *pte |= vm->arch.s_bit; -} - -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) -{ - __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); -} - -void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint64_t nr_bytes, int level) -{ - uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t nr_pages = nr_bytes / pg_size; - int i; - - TEST_ASSERT(nr_bytes % pg_size == 0, - "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", - nr_bytes, pg_size); - - for (i = 0; i < nr_pages; i++) { - __virt_pg_map(vm, vaddr, paddr, level); - - vaddr += pg_size; - paddr += pg_size; - } -} - -static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) -{ - if (*pte & PTE_LARGE_MASK) { - TEST_ASSERT(*level == PG_LEVEL_NONE || - *level == current_level, - "Unexpected hugepage at level %d", current_level); - *level = current_level; - } - - return *level == current_level; -} - -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level) -{ - uint64_t *pml4e, *pdpe, *pde; - - TEST_ASSERT(!vm->arch.is_pt_protected, - "Walking page tables of protected guests is impossible"); - - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, - "Invalid PG_LEVEL_* '%d'", *level); - - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, - (vaddr >> vm->page_shift)), - "Invalid virtual address, vaddr: 0x%lx", - vaddr); - /* - * Based on the mode check above there are 48 bits in the vaddr, so - * shift 16 to sign extend the last bit (bit-47), - */ - TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), - "Canonical check failed. 
The virtual address is invalid."); - - pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) - return pml4e; - - pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) - return pdpe; - - pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) - return pde; - - return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); -} - -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) -{ - int level = PG_LEVEL_4K; - - return __vm_get_page_table_entry(vm, vaddr, &level); -} - -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - uint64_t *pml4e, *pml4e_start; - uint64_t *pdpe, *pdpe_start; - uint64_t *pde, *pde_start; - uint64_t *pte, *pte_start; - - if (!vm->pgd_created) - return; - - fprintf(stream, "%*s " - " no\n", indent, ""); - fprintf(stream, "%*s index hvaddr gpaddr " - "addr w exec dirty\n", - indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); - for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { - pml4e = &pml4e_start[n1]; - if (!(*pml4e & PTE_PRESENT_MASK)) - continue; - fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " - " %u\n", - indent, "", - pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), - !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); - - pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); - for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { - pdpe = &pdpe_start[n2]; - if (!(*pdpe & PTE_PRESENT_MASK)) - continue; - fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " - "%u %u\n", - indent, "", - pdpe - pdpe_start, pdpe, - addr_hva2gpa(vm, pdpe), - PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), - !!(*pdpe & PTE_NX_MASK)); - - pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); - for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { - pde = &pde_start[n3]; - if (!(*pde & PTE_PRESENT_MASK)) - continue; - fprintf(stream, "%*spde 0x%-3zx %p " - "0x%-12lx 0x%-10llx %u %u\n", - indent, "", pde - pde_start, pde, - addr_hva2gpa(vm, pde), - PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), - !!(*pde & PTE_NX_MASK)); - - pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); - for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { - pte = &pte_start[n4]; - if (!(*pte & PTE_PRESENT_MASK)) - continue; - fprintf(stream, "%*spte 0x%-3zx %p " - "0x%-12lx 0x%-10llx %u %u " - " %u 0x%-10lx\n", - indent, "", - pte - pte_start, pte, - addr_hva2gpa(vm, pte), - PTE_GET_PFN(*pte), - !!(*pte & PTE_WRITABLE_MASK), - !!(*pte & PTE_NX_MASK), - !!(*pte & PTE_DIRTY_MASK), - ((uint64_t) n1 << 27) - | ((uint64_t) n2 << 18) - | ((uint64_t) n3 << 9) - | ((uint64_t) n4)); - } - } - } - } -} - -/* - * Set Unusable Segment - * - * Input Args: None - * - * Output Args: - * segp - Pointer to segment register - * - * Return: None - * - * Sets the segment register pointed to by @segp to an unusable state. 
- */ -static void kvm_seg_set_unusable(struct kvm_segment *segp) -{ - memset(segp, 0, sizeof(*segp)); - segp->unusable = true; -} - -static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) -{ - void *gdt = addr_gva2hva(vm, vm->arch.gdt); - struct desc64 *desc = gdt + (segp->selector >> 3) * 8; - - desc->limit0 = segp->limit & 0xFFFF; - desc->base0 = segp->base & 0xFFFF; - desc->base1 = segp->base >> 16; - desc->type = segp->type; - desc->s = segp->s; - desc->dpl = segp->dpl; - desc->p = segp->present; - desc->limit1 = segp->limit >> 16; - desc->avl = segp->avl; - desc->l = segp->l; - desc->db = segp->db; - desc->g = segp->g; - desc->base2 = segp->base >> 24; - if (!segp->s) - desc->base3 = segp->base >> 32; -} - -static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) -{ - memset(segp, 0, sizeof(*segp)); - segp->selector = KERNEL_CS; - segp->limit = 0xFFFFFFFFu; - segp->s = 0x1; /* kTypeCodeData */ - segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed - * | kFlagCodeReadable - */ - segp->g = true; - segp->l = true; - segp->present = 1; -} - -static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) -{ - memset(segp, 0, sizeof(*segp)); - segp->selector = KERNEL_DS; - segp->limit = 0xFFFFFFFFu; - segp->s = 0x1; /* kTypeCodeData */ - segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed - * | kFlagDataWritable - */ - segp->g = true; - segp->present = true; -} - -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - int level = PG_LEVEL_NONE; - uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); - - TEST_ASSERT(*pte & PTE_PRESENT_MASK, - "Leaf PTE not PRESENT for gva: 0x%08lx", gva); - - /* - * No need for a hugepage mask on the PTE, x86-64 requires the "unused" - * address bits to be zero. - */ - return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); -} - -static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) -{ - memset(segp, 0, sizeof(*segp)); - segp->base = base; - segp->limit = 0x67; - segp->selector = KERNEL_TSS; - segp->type = 0xb; - segp->present = 1; -} - -static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) -{ - struct kvm_sregs sregs; - - TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); - - /* Set mode specific system register values. 
*/ - vcpu_sregs_get(vcpu, &sregs); - - sregs.idt.base = vm->arch.idt; - sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->arch.gdt; - sregs.gdt.limit = getpagesize() - 1; - - sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; - if (kvm_cpu_has(X86_FEATURE_XSAVE)) - sregs.cr4 |= X86_CR4_OSXSAVE; - sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); - - kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(&sregs.cs); - kvm_seg_set_kernel_data_64bit(&sregs.ds); - kvm_seg_set_kernel_data_64bit(&sregs.es); - kvm_seg_set_kernel_data_64bit(&sregs.gs); - kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vcpu, &sregs); -} - -static void vcpu_init_xcrs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) -{ - struct kvm_xcrs xcrs = { - .nr_xcrs = 1, - .xcrs[0].xcr = 0, - .xcrs[0].value = kvm_cpu_supported_xcr0(), - }; - - if (!kvm_cpu_has(X86_FEATURE_XSAVE)) - return; - - vcpu_xcrs_set(vcpu, &xcrs); -} - -static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, - int dpl, unsigned short selector) -{ - struct idt_entry *base = - (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); - struct idt_entry *e = &base[vector]; - - memset(e, 0, sizeof(*e)); - e->offset0 = addr; - e->selector = selector; - e->ist = 0; - e->type = 14; - e->dpl = dpl; - e->p = 1; - e->offset1 = addr >> 16; - e->offset2 = addr >> 32; -} - -static bool kvm_fixup_exception(struct ex_regs *regs) -{ - if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) - return false; - - if (regs->vector == DE_VECTOR) - return false; - - regs->rip = regs->r11; - regs->r9 = regs->vector; - regs->r10 = regs->error_code; - return true; -} - -void route_exception(struct ex_regs *regs) -{ - typedef void(*handler)(struct ex_regs *); - handler *handlers = (handler *)exception_handlers; - - if (handlers && handlers[regs->vector]) { - handlers[regs->vector](regs); - return; - } - - if (kvm_fixup_exception(regs)) - return; - - GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); -} - -static void vm_init_descriptor_tables(struct kvm_vm *vm) -{ - extern void *idt_handlers; - struct kvm_segment seg; - int i; - - vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - - /* Handlers have the same address in both address spaces.*/ - for (i = 0; i < NUM_INTERRUPTS; i++) - set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); - - *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; - - kvm_seg_set_kernel_code_64bit(&seg); - kvm_seg_fill_gdt_64bit(vm, &seg); - - kvm_seg_set_kernel_data_64bit(&seg); - kvm_seg_fill_gdt_64bit(vm, &seg); - - kvm_seg_set_tss_64bit(vm->arch.tss, &seg); - kvm_seg_fill_gdt_64bit(vm, &seg); -} - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) -{ - vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); - - handlers[vector] = (vm_vaddr_t)handler; -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) == UCALL_ABORT) - REPORT_GUEST_ASSERT(uc); -} - -void kvm_arch_vm_post_create(struct kvm_vm *vm) -{ - int r; - - TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ), - "Require KVM_GET_TSC_KHZ to provide udelay() 
to guest."); - - vm_create_irqchip(vm); - vm_init_descriptor_tables(vm); - - sync_global_to_guest(vm, host_cpu_is_intel); - sync_global_to_guest(vm, host_cpu_is_amd); - sync_global_to_guest(vm, is_forced_emulation_enabled); - - if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { - struct kvm_sev_init init = { 0 }; - - vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); - } - - r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL); - TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency."); - guest_tsc_khz = r; - sync_global_to_guest(vm, guest_tsc_khz); -} - -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) -{ - struct kvm_regs regs; - - vcpu_regs_get(vcpu, ®s); - regs.rip = (unsigned long) guest_code; - vcpu_regs_set(vcpu, ®s); -} - -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) -{ - struct kvm_mp_state mp_state; - struct kvm_regs regs; - vm_vaddr_t stack_vaddr; - struct kvm_vcpu *vcpu; - - stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, - MEM_REGION_DATA); - - stack_vaddr += DEFAULT_STACK_PGS * getpagesize(); - - /* - * Align stack to match calling sequence requirements in section "The - * Stack Frame" of the System V ABI AMD64 Architecture Processor - * Supplement, which requires the value (%rsp + 8) to be a multiple of - * 16 when control is transferred to the function entry point. - * - * If this code is ever used to launch a vCPU with 32-bit entry point it - * may need to subtract 4 bytes instead of 8 bytes. - */ - TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE), - "__vm_vaddr_alloc() did not provide a page-aligned address"); - stack_vaddr -= 8; - - vcpu = __vm_vcpu_add(vm, vcpu_id); - vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - vcpu_init_sregs(vm, vcpu); - vcpu_init_xcrs(vm, vcpu); - - /* Setup guest general purpose registers */ - vcpu_regs_get(vcpu, ®s); - regs.rflags = regs.rflags | 0x2; - regs.rsp = stack_vaddr; - vcpu_regs_set(vcpu, ®s); - - /* Setup the MP state */ - mp_state.mp_state = 0; - vcpu_mp_state_set(vcpu, &mp_state); - - /* - * Refresh CPUID after setting SREGS and XCR0, so that KVM's "runtime" - * updates to guest CPUID, e.g. for OSXSAVE and XSAVE state size, are - * reflected into selftests' vCPU CPUID cache, i.e. so that the cache - * is consistent with vCPU state. - */ - vcpu_get_cpuid(vcpu); - return vcpu; -} - -struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) -{ - struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - - vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - - return vcpu; -} - -void vcpu_arch_free(struct kvm_vcpu *vcpu) -{ - if (vcpu->cpuid) - free(vcpu->cpuid); -} - -/* Do not use kvm_supported_cpuid directly except for validity checks. 
*/ -static void *kvm_supported_cpuid; - -const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) -{ - int kvm_fd; - - if (kvm_supported_cpuid) - return kvm_supported_cpuid; - - kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - kvm_fd = open_kvm_dev_path_or_exit(); - - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, - (struct kvm_cpuid2 *)kvm_supported_cpuid); - - close(kvm_fd); - return kvm_supported_cpuid; -} - -static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index, - uint8_t reg, uint8_t lo, uint8_t hi) -{ - const struct kvm_cpuid_entry2 *entry; - int i; - - for (i = 0; i < cpuid->nent; i++) { - entry = &cpuid->entries[i]; - - /* - * The output registers in kvm_cpuid_entry2 are in alphabetical - * order, but kvm_x86_cpu_feature matches that mess, so yay - * pointer shenanigans! - */ - if (entry->function == function && entry->index == index) - return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo; - } - - return 0; -} - -bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, - struct kvm_x86_cpu_feature feature) -{ - return __kvm_cpu_has(cpuid, feature.function, feature.index, - feature.reg, feature.bit, feature.bit); -} - -uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, - struct kvm_x86_cpu_property property) -{ - return __kvm_cpu_has(cpuid, property.function, property.index, - property.reg, property.lo_bit, property.hi_bit); -} - -uint64_t kvm_get_feature_msr(uint64_t msr_index) -{ - struct { - struct kvm_msrs header; - struct kvm_msr_entry entry; - } buffer = {}; - int r, kvm_fd; - - buffer.header.nmsrs = 1; - buffer.entry.index = msr_index; - kvm_fd = open_kvm_dev_path_or_exit(); - - r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); - - close(kvm_fd); - return buffer.entry.data; -} - -void __vm_xsave_require_permission(uint64_t xfeature, const char *name) -{ - int kvm_fd; - u64 bitmask; - long rc; - struct kvm_device_attr attr = { - .group = 0, - .attr = KVM_X86_XCOMP_GUEST_SUPP, - .addr = (unsigned long) &bitmask, - }; - - TEST_ASSERT(!kvm_supported_cpuid, - "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM"); - - TEST_ASSERT(is_power_of_2(xfeature), - "Dynamic XFeatures must be enabled one at a time"); - - kvm_fd = open_kvm_dev_path_or_exit(); - rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); - close(kvm_fd); - - if (rc == -1 && (errno == ENXIO || errno == EINVAL)) - __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); - - TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); - - __TEST_REQUIRE(bitmask & xfeature, - "Required XSAVE feature '%s' not supported", name); - - TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature))); - - rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); - TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); - TEST_ASSERT(bitmask & xfeature, - "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx", - name, xfeature, bitmask); -} - -void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) -{ - TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); - - /* Allow overriding the default CPUID. 
*/ - if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) { - free(vcpu->cpuid); - vcpu->cpuid = NULL; - } - - if (!vcpu->cpuid) - vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent); - - memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent)); - vcpu_set_cpuid(vcpu); -} - -void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_property property, - uint32_t value) -{ - struct kvm_cpuid_entry2 *entry; - - entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index); - - (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit); - (&entry->eax)[property.reg] |= value << property.lo_bit; - - vcpu_set_cpuid(vcpu); - - /* Sanity check that @value doesn't exceed the bounds in any way. */ - TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value); -} - -void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) -{ - struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function); - - entry->eax = 0; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - vcpu_set_cpuid(vcpu); -} - -void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, - struct kvm_x86_cpu_feature feature, - bool set) -{ - struct kvm_cpuid_entry2 *entry; - u32 *reg; - - entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); - reg = (&entry->eax) + feature.reg; - - if (set) - *reg |= BIT(feature.bit); - else - *reg &= ~BIT(feature.bit); - - vcpu_set_cpuid(vcpu); -} - -uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) -{ - struct { - struct kvm_msrs header; - struct kvm_msr_entry entry; - } buffer = {}; - - buffer.header.nmsrs = 1; - buffer.entry.index = msr_index; - - vcpu_msrs_get(vcpu, &buffer.header); - - return buffer.entry.data; -} - -int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value) -{ - struct { - struct kvm_msrs header; - struct kvm_msr_entry entry; - } buffer = {}; - - memset(&buffer, 0, sizeof(buffer)); - buffer.header.nmsrs = 1; - buffer.entry.index = msr_index; - buffer.entry.data = msr_value; - - return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header); -} - -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
-{ - va_list ap; - struct kvm_regs regs; - - TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" - " num: %u", - num); - - va_start(ap, num); - vcpu_regs_get(vcpu, ®s); - - if (num >= 1) - regs.rdi = va_arg(ap, uint64_t); - - if (num >= 2) - regs.rsi = va_arg(ap, uint64_t); - - if (num >= 3) - regs.rdx = va_arg(ap, uint64_t); - - if (num >= 4) - regs.rcx = va_arg(ap, uint64_t); - - if (num >= 5) - regs.r8 = va_arg(ap, uint64_t); - - if (num >= 6) - regs.r9 = va_arg(ap, uint64_t); - - vcpu_regs_set(vcpu, ®s); - va_end(ap); -} - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) -{ - struct kvm_regs regs; - struct kvm_sregs sregs; - - fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id); - - fprintf(stream, "%*sregs:\n", indent + 2, ""); - vcpu_regs_get(vcpu, ®s); - regs_dump(stream, ®s, indent + 4); - - fprintf(stream, "%*ssregs:\n", indent + 2, ""); - vcpu_sregs_get(vcpu, &sregs); - sregs_dump(stream, &sregs, indent + 4); -} - -static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs) -{ - struct kvm_msr_list *list; - struct kvm_msr_list nmsrs; - int kvm_fd, r; - - kvm_fd = open_kvm_dev_path_or_exit(); - - nmsrs.nmsrs = 0; - if (!feature_msrs) - r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); - else - r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs); - - TEST_ASSERT(r == -1 && errno == E2BIG, - "Expected -E2BIG, got rc: %i errno: %i (%s)", - r, errno, strerror(errno)); - - list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0])); - TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); - list->nmsrs = nmsrs.nmsrs; - - if (!feature_msrs) - kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - else - kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); - close(kvm_fd); - - TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, - "Number of MSRs in list changed, was %d, now %d", - nmsrs.nmsrs, list->nmsrs); - return list; -} - -const struct kvm_msr_list *kvm_get_msr_index_list(void) -{ - static const struct kvm_msr_list *list; - - if (!list) - list = __kvm_get_msr_index_list(false); - return list; -} - - -const struct kvm_msr_list *kvm_get_feature_msr_index_list(void) -{ - static const struct kvm_msr_list *list; - - if (!list) - list = __kvm_get_msr_index_list(true); - return list; -} - -bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) -{ - const struct kvm_msr_list *list = kvm_get_msr_index_list(); - int i; - - for (i = 0; i < list->nmsrs; ++i) { - if (list->indices[i] == msr_index) - return true; - } - - return false; -} - -static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu, - struct kvm_x86_state *state) -{ - int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2); - - if (size) { - state->xsave = malloc(size); - vcpu_xsave2_get(vcpu, state->xsave); - } else { - state->xsave = malloc(sizeof(struct kvm_xsave)); - vcpu_xsave_get(vcpu, state->xsave); - } -} - -struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) -{ - const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); - struct kvm_x86_state *state; - int i; - - static int nested_size = -1; - - if (nested_size == -1) { - nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); - TEST_ASSERT(nested_size <= sizeof(state->nested_), - "Nested state size too big, %i > %zi", - nested_size, sizeof(state->nested_)); - } - - /* - * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees - * guest state is consistent only after userspace re-enters the - * kernel with KVM_RUN. Complete IO prior to migrating state - * to a new VM. 
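 * vcpu_run_complete_io() re-enters KVM_RUN with run->immediate_exit set, so
 * the pending I/O is completed and control returns to userspace without any
 * further guest code being executed.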
- */ - vcpu_run_complete_io(vcpu); - - state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); - TEST_ASSERT(state, "-ENOMEM when allocating kvm state"); - - vcpu_events_get(vcpu, &state->events); - vcpu_mp_state_get(vcpu, &state->mp_state); - vcpu_regs_get(vcpu, &state->regs); - vcpu_save_xsave_state(vcpu, state); - - if (kvm_has_cap(KVM_CAP_XCRS)) - vcpu_xcrs_get(vcpu, &state->xcrs); - - vcpu_sregs_get(vcpu, &state->sregs); - - if (nested_size) { - state->nested.size = sizeof(state->nested_); - - vcpu_nested_state_get(vcpu, &state->nested); - TEST_ASSERT(state->nested.size <= nested_size, - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", - state->nested.size, nested_size); - } else { - state->nested.size = 0; - } - - state->msrs.nmsrs = msr_list->nmsrs; - for (i = 0; i < msr_list->nmsrs; i++) - state->msrs.entries[i].index = msr_list->indices[i]; - vcpu_msrs_get(vcpu, &state->msrs); - - vcpu_debugregs_get(vcpu, &state->debugregs); - - return state; -} - -void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) -{ - vcpu_sregs_set(vcpu, &state->sregs); - vcpu_msrs_set(vcpu, &state->msrs); - - if (kvm_has_cap(KVM_CAP_XCRS)) - vcpu_xcrs_set(vcpu, &state->xcrs); - - vcpu_xsave_set(vcpu, state->xsave); - vcpu_events_set(vcpu, &state->events); - vcpu_mp_state_set(vcpu, &state->mp_state); - vcpu_debugregs_set(vcpu, &state->debugregs); - vcpu_regs_set(vcpu, &state->regs); - - if (state->nested.size) - vcpu_nested_state_set(vcpu, &state->nested); -} - -void kvm_x86_state_cleanup(struct kvm_x86_state *state) -{ - free(state->xsave); - free(state); -} - -void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) -{ - if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) { - *pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32; - *va_bits = 32; - } else { - *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); - *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR); - } -} - -void kvm_init_vm_address_properties(struct kvm_vm *vm) -{ - if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); - vm->gpa_tag_mask = vm->arch.c_bit; - } else { - vm->arch.sev_fd = -1; - } -} - -const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index) -{ - int i; - - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == function && - cpuid->entries[i].index == index) - return &cpuid->entries[i]; - } - - TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); - - return NULL; -} - -#define X86_HYPERCALL(inputs...) 
\ -({ \ - uint64_t r; \ - \ - asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t" \ - "jnz 1f\n\t" \ - "vmcall\n\t" \ - "jmp 2f\n\t" \ - "1: vmmcall\n\t" \ - "2:" \ - : "=a"(r) \ - : [use_vmmcall] "r" (host_cpu_is_amd), inputs); \ - \ - r; \ -}) - -uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, - uint64_t a3) -{ - return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)); -} - -uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1) -{ - return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1)); -} - -void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) -{ - GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); -} - -unsigned long vm_compute_max_gfn(struct kvm_vm *vm) -{ - const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ - unsigned long ht_gfn, max_gfn, max_pfn; - uint8_t maxphyaddr, guest_maxphyaddr; - - /* - * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR - * enumerates the max _mappable_ GPA, which can be less than the raw - * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU - * doesn't support 5-level TDP. - */ - guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR); - guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits; - TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits, - "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR"); - - max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1; - - /* Avoid reserved HyperTransport region on AMD processors. */ - if (!host_cpu_is_amd) - return max_gfn; - - /* On parts with <40 physical address bits, the area is fully hidden */ - if (vm->pa_bits < 40) - return max_gfn; - - /* Before family 17h, the HyperTransport area is just below 1T. */ - ht_gfn = (1 << 28) - num_ht_pages; - if (this_cpu_family() < 0x17) - goto done; - - /* - * Otherwise it's at the top of the physical address space, possibly - * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use - * the old conservative value if MAXPHYADDR is not enumerated. - */ - if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) - goto done; - - maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); - max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1; - - if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION)) - max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION); - - ht_gfn = max_pfn - num_ht_pages; -done: - return min(max_gfn, ht_gfn - 1); -} - -/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ -bool vm_is_unrestricted_guest(struct kvm_vm *vm) -{ - /* Ensure that a KVM vendor-specific module is loaded. 
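 * When no VM is passed in, opening /dev/kvm (and immediately closing it) is
 * enough to force that; the parameter itself is then read from kvm_intel's
 * module parameters via get_kvm_intel_param_bool().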
*/ - if (vm == NULL) - close(open_kvm_dev_path_or_exit()); - - return get_kvm_intel_param_bool("unrestricted_guest"); -} - -void kvm_selftest_arch_init(void) -{ - host_cpu_is_intel = this_cpu_is_intel(); - host_cpu_is_amd = this_cpu_is_amd(); - is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); -} - -bool sys_clocksource_is_based_on_tsc(void) -{ - char *clk_name = sys_get_cur_clocksource(); - bool ret = !strcmp(clk_name, "tsc\n") || - !strcmp(clk_name, "hyperv_clocksource_tsc_page\n"); - - free(clk_name); - - return ret; -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c deleted file mode 100644 index e9535ee20b7f..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include - -#include "sev.h" - -/* - * sparsebit_next_clear() can return 0 if [x, 2**64-1] are all set, and the - * -1 would then cause an underflow back to 2**64 - 1. This is expected and - * correct. - * - * If the last range in the sparsebit is [x, y] and we try to iterate, - * sparsebit_next_set() will return 0, and sparsebit_next_clear() will try - * and find the first range, but that's correct because the condition - * expression would cause us to quit the loop. - */ -static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region) -{ - const struct sparsebit *protected_phy_pages = region->protected_phy_pages; - const vm_paddr_t gpa_base = region->region.guest_phys_addr; - const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift; - sparsebit_idx_t i, j; - - if (!sparsebit_any_set(protected_phy_pages)) - return; - - sev_register_encrypted_memory(vm, region); - - sparsebit_for_each_set_range(protected_phy_pages, i, j) { - const uint64_t size = (j - i + 1) * vm->page_size; - const uint64_t offset = (i - lowest_page_in_region) * vm->page_size; - - sev_launch_update_data(vm, gpa_base + offset, size); - } -} - -void sev_vm_init(struct kvm_vm *vm) -{ - if (vm->type == KVM_X86_DEFAULT_VM) { - assert(vm->arch.sev_fd == -1); - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); - } else { - struct kvm_sev_init init = { 0 }; - assert(vm->type == KVM_X86_SEV_VM); - vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); - } -} - -void sev_es_vm_init(struct kvm_vm *vm) -{ - if (vm->type == KVM_X86_DEFAULT_VM) { - assert(vm->arch.sev_fd == -1); - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); - } else { - struct kvm_sev_init init = { 0 }; - assert(vm->type == KVM_X86_SEV_ES_VM); - vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); - } -} - -void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) -{ - struct kvm_sev_launch_start launch_start = { - .policy = policy, - }; - struct userspace_mem_region *region; - struct kvm_sev_guest_status status; - int ctr; - - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_START, &launch_start); - vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); - - TEST_ASSERT_EQ(status.policy, policy); - TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE); - - hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) - encrypt_region(vm, region); - - if (policy & SEV_POLICY_ES) - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); - - vm->arch.is_pt_protected = true; -} - -void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement) -{ - struct kvm_sev_launch_measure launch_measure; - struct kvm_sev_guest_status guest_status; - - launch_measure.len = 256; - 
launch_measure.uaddr = (__u64)measurement; - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_MEASURE, &launch_measure); - - vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &guest_status); - TEST_ASSERT_EQ(guest_status.state, SEV_GUEST_STATE_LAUNCH_SECRET); -} - -void sev_vm_launch_finish(struct kvm_vm *vm) -{ - struct kvm_sev_guest_status status; - - vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); - TEST_ASSERT(status.state == SEV_GUEST_STATE_LAUNCH_UPDATE || - status.state == SEV_GUEST_STATE_LAUNCH_SECRET, - "Unexpected guest state: %d", status.state); - - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_FINISH, NULL); - - vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); - TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); -} - -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, - struct kvm_vcpu **cpu) -{ - struct vm_shape shape = { - .mode = VM_MODE_DEFAULT, - .type = type, - }; - struct kvm_vm *vm; - struct kvm_vcpu *cpus[1]; - - vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); - *cpu = cpus[0]; - - return vm; -} - -void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) -{ - sev_vm_launch(vm, policy); - - if (!measurement) - measurement = alloca(256); - - sev_vm_launch_measure(vm, measurement); - - sev_vm_launch_finish(vm); -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c deleted file mode 100644 index 5495a92dfd5a..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ /dev/null @@ -1,164 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * tools/testing/selftests/kvm/lib/x86_64/svm.c - * Helpers used for nested SVM testing - * Largely inspired from KVM unit test svm.c - * - * Copyright (C) 2020, Red Hat, Inc. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" - -#define SEV_DEV_PATH "/dev/sev" - -struct gpr64_regs guest_regs; -u64 rflags; - -/* Allocate memory regions for nested SVM tests. - * - * Input Args: - * vm - The VM to allocate guest-virtual addresses in. - * - * Output Args: - * p_svm_gva - The guest virtual address for the struct svm_test_data. - * - * Return: - * Pointer to structure with the addresses of the SVM areas. 
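 *
 * Sketch of the usual flow in a nested SVM test (l2_guest_code and
 * l2_stack_top are illustrative names): the host allocates the areas and
 * passes the guest virtual address to L1, which sets up and enters L2:
 *
 *	svm = vcpu_alloc_svm(vm, &svm_gva);		// host
 *	vcpu_args_set(vcpu, 1, svm_gva);
 *
 *	generic_svm_setup(svm, l2_guest_code, l2_stack_top);	// L1 guest
 *	run_guest(svm->vmcb, svm->vmcb_gpa);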
- */ -struct svm_test_data * -vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) -{ - vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); - struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); - - svm->vmcb = (void *)vm_vaddr_alloc_page(vm); - svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); - svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); - - svm->save_area = (void *)vm_vaddr_alloc_page(vm); - svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); - svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); - - svm->msr = (void *)vm_vaddr_alloc_page(vm); - svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); - svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); - memset(svm->msr_hva, 0, getpagesize()); - - *p_svm_gva = svm_gva; - return svm; -} - -static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, - u64 base, u32 limit, u32 attr) -{ - seg->selector = selector; - seg->attrib = attr; - seg->limit = limit; - seg->base = base; -} - -void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) -{ - struct vmcb *vmcb = svm->vmcb; - uint64_t vmcb_gpa = svm->vmcb_gpa; - struct vmcb_save_area *save = &vmcb->save; - struct vmcb_control_area *ctrl = &vmcb->control; - u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK - | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK; - u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK - | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK; - uint64_t efer; - - efer = rdmsr(MSR_EFER); - wrmsr(MSR_EFER, efer | EFER_SVME); - wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); - - memset(vmcb, 0, sizeof(*vmcb)); - asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); - vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); - vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); - vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); - vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr); - vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0); - vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0); - - ctrl->asid = 1; - save->cpl = 0; - save->efer = rdmsr(MSR_EFER); - asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory"); - asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory"); - asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory"); - asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory"); - asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory"); - asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory"); - save->g_pat = rdmsr(MSR_IA32_CR_PAT); - save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); - ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | - (1ULL << INTERCEPT_VMMCALL); - ctrl->msrpm_base_pa = svm->msr_gpa; - - vmcb->save.rip = (u64)guest_rip; - vmcb->save.rsp = (u64)guest_rsp; - guest_regs.rdi = (u64)svm; -} - -/* - * save/restore 64-bit general registers except rax, rip, rsp - * which are directly handed through the VMCB guest processor state - */ -#define SAVE_GPR_C \ - "xchg %%rbx, guest_regs+0x20\n\t" \ - "xchg %%rcx, guest_regs+0x10\n\t" \ - "xchg %%rdx, guest_regs+0x18\n\t" \ - "xchg %%rbp, guest_regs+0x30\n\t" \ - "xchg %%rsi, guest_regs+0x38\n\t" \ - "xchg %%rdi, guest_regs+0x40\n\t" \ - "xchg %%r8, guest_regs+0x48\n\t" \ - "xchg %%r9, guest_regs+0x50\n\t" \ - "xchg %%r10, guest_regs+0x58\n\t" \ - "xchg %%r11, guest_regs+0x60\n\t" \ - "xchg %%r12, guest_regs+0x68\n\t" \ - "xchg %%r13, guest_regs+0x70\n\t" \ - "xchg %%r14, guest_regs+0x78\n\t" \ - "xchg %%r15, guest_regs+0x80\n\t" - -#define 
LOAD_GPR_C SAVE_GPR_C - -/* - * selftests do not use interrupts so we dropped clgi/sti/cli/stgi - * for now. registers involved in LOAD/SAVE_GPR_C are eventually - * unmodified so they do not need to be in the clobber list. - */ -void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) -{ - asm volatile ( - "vmload %[vmcb_gpa]\n\t" - "mov rflags, %%r15\n\t" // rflags - "mov %%r15, 0x170(%[vmcb])\n\t" - "mov guest_regs, %%r15\n\t" // rax - "mov %%r15, 0x1f8(%[vmcb])\n\t" - LOAD_GPR_C - "vmrun %[vmcb_gpa]\n\t" - SAVE_GPR_C - "mov 0x170(%[vmcb]), %%r15\n\t" // rflags - "mov %%r15, rflags\n\t" - "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax - "mov %%r15, guest_regs\n\t" - "vmsave %[vmcb_gpa]\n\t" - : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) - : "r15", "memory"); -} - -/* - * Open SEV_DEV_PATH if available, otherwise exit the entire program. - * - * Return: - * The opened file descriptor of /dev/sev. - */ -int open_sev_dev_path_or_exit(void) -{ - return open_path_or_exit(SEV_DEV_PATH, 0); -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c deleted file mode 100644 index 1265cecc7dd1..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ucall support. A ucall is a "hypercall to userspace". - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#include "kvm_util.h" - -#define UCALL_PIO_PORT ((uint16_t)0x1000) - -void ucall_arch_do_ucall(vm_vaddr_t uc) -{ - /* - * FIXME: Revert this hack (the entire commit that added it) once nVMX - * preserves L2 GPRs across a nested VM-Exit. If a ucall from L2, e.g. - * to do a GUEST_SYNC(), lands the vCPU in L1, any and all GPRs can be - * clobbered by L1. Save and restore non-volatile GPRs (clobbering RBP - * in particular is problematic) along with RDX and RDI (which are - * inputs), and clobber volatile GPRs. *sigh* - */ -#define HORRIFIC_L2_UCALL_CLOBBER_HACK \ - "rcx", "rsi", "r8", "r9", "r10", "r11" - - asm volatile("push %%rbp\n\t" - "push %%r15\n\t" - "push %%r14\n\t" - "push %%r13\n\t" - "push %%r12\n\t" - "push %%rbx\n\t" - "push %%rdx\n\t" - "push %%rdi\n\t" - "in %[port], %%al\n\t" - "pop %%rdi\n\t" - "pop %%rdx\n\t" - "pop %%rbx\n\t" - "pop %%r12\n\t" - "pop %%r13\n\t" - "pop %%r14\n\t" - "pop %%r15\n\t" - "pop %%rbp\n\t" - : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory", - HORRIFIC_L2_UCALL_CLOBBER_HACK); -} - -void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { - struct kvm_regs regs; - - vcpu_regs_get(vcpu, ®s); - return (void *)regs.rdi; - } - return NULL; -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c deleted file mode 100644 index d7ac122820bf..000000000000 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ /dev/null @@ -1,554 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * tools/testing/selftests/kvm/lib/x86_64/vmx.c - * - * Copyright (C) 2018, Google LLC. 
- */ - -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#define PAGE_SHIFT_4K 12 - -#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 - -bool enable_evmcs; - -struct hv_enlightened_vmcs *current_evmcs; -struct hv_vp_assist_page *current_vp_assist; - -struct eptPageTableEntry { - uint64_t readable:1; - uint64_t writable:1; - uint64_t executable:1; - uint64_t memory_type:3; - uint64_t ignore_pat:1; - uint64_t page_size:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t ignored_11_10:2; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t suppress_ve:1; -}; - -struct eptPageTablePointer { - uint64_t memory_type:3; - uint64_t page_walk_length:3; - uint64_t ad_enabled:1; - uint64_t reserved_11_07:5; - uint64_t address:40; - uint64_t reserved_63_52:12; -}; -int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) -{ - uint16_t evmcs_ver; - - vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - (unsigned long)&evmcs_ver); - - /* KVM should return supported EVMCS version range */ - TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && - (evmcs_ver & 0xff) > 0, - "Incorrect EVMCS version range: %x:%x", - evmcs_ver & 0xff, evmcs_ver >> 8); - - return evmcs_ver; -} - -/* Allocate memory regions for nested VMX tests. - * - * Input Args: - * vm - The VM to allocate guest-virtual addresses in. - * - * Output Args: - * p_vmx_gva - The guest virtual address for the struct vmx_pages. - * - * Return: - * Pointer to structure with the addresses of the VMX areas. - */ -struct vmx_pages * -vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) -{ - vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); - struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); - - /* Setup of a region of guest memory for the vmxon region. */ - vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); - vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); - vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); - - /* Setup of a region of guest memory for a vmcs. */ - vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); - vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); - vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); - - /* Setup of a region of guest memory for the MSR bitmap. */ - vmx->msr = (void *)vm_vaddr_alloc_page(vm); - vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); - vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); - memset(vmx->msr_hva, 0, getpagesize()); - - /* Setup of a region of guest memory for the shadow VMCS. */ - vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); - vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); - vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); - - /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ - vmx->vmread = (void *)vm_vaddr_alloc_page(vm); - vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); - vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); - memset(vmx->vmread_hva, 0, getpagesize()); - - vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); - vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); - vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); - memset(vmx->vmwrite_hva, 0, getpagesize()); - - *p_vmx_gva = vmx_gva; - return vmx; -} - -bool prepare_for_vmx_operation(struct vmx_pages *vmx) -{ - uint64_t feature_control; - uint64_t required; - unsigned long cr0; - unsigned long cr4; - - /* - * Ensure bits in CR0 and CR4 are valid in VMX operation: - * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx. 
- * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx. - */ - __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory"); - cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1); - cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0); - __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory"); - - __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory"); - cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1); - cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0); - /* Enable VMX operation */ - cr4 |= X86_CR4_VMXE; - __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory"); - - /* - * Configure IA32_FEATURE_CONTROL MSR to allow VMXON: - * Bit 0: Lock bit. If clear, VMXON causes a #GP. - * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON - * outside of SMX causes a #GP. - */ - required = FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; - required |= FEAT_CTL_LOCKED; - feature_control = rdmsr(MSR_IA32_FEAT_CTL); - if ((feature_control & required) != required) - wrmsr(MSR_IA32_FEAT_CTL, feature_control | required); - - /* Enter VMX root operation. */ - *(uint32_t *)(vmx->vmxon) = vmcs_revision(); - if (vmxon(vmx->vmxon_gpa)) - return false; - - return true; -} - -bool load_vmcs(struct vmx_pages *vmx) -{ - /* Load a VMCS. */ - *(uint32_t *)(vmx->vmcs) = vmcs_revision(); - if (vmclear(vmx->vmcs_gpa)) - return false; - - if (vmptrld(vmx->vmcs_gpa)) - return false; - - /* Setup shadow VMCS, do not load it yet. */ - *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; - if (vmclear(vmx->shadow_vmcs_gpa)) - return false; - - return true; -} - -static bool ept_vpid_cap_supported(uint64_t mask) -{ - return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask; -} - -bool ept_1g_pages_supported(void) -{ - return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES); -} - -/* - * Initialize the control fields to the most basic settings possible. 
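 * When an EPT root has been allocated (vmx->eptp_gpa != 0), the EPT pointer
 * is assembled below with write-back memory type, a page-walk length field
 * of 3 (the field encodes the number of levels minus one, i.e. a 4-level
 * walk) and A/D bits enabled when the CPU supports them;
 * SECONDARY_EXEC_ENABLE_EPT is then set in the secondary execution controls.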
- */ -static inline void init_vmcs_control_fields(struct vmx_pages *vmx) -{ - uint32_t sec_exec_ctl = 0; - - vmwrite(VIRTUAL_PROCESSOR_ID, 0); - vmwrite(POSTED_INTR_NV, 0); - - vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); - - if (vmx->eptp_gpa) { - uint64_t ept_paddr; - struct eptPageTablePointer eptp = { - .memory_type = X86_MEMTYPE_WB, - .page_walk_length = 3, /* + 1 */ - .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), - .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, - }; - - memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); - vmwrite(EPT_POINTER, ept_paddr); - sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; - } - - if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) - vmwrite(CPU_BASED_VM_EXEC_CONTROL, - rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); - else { - vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); - GUEST_ASSERT(!sec_exec_ctl); - } - - vmwrite(EXCEPTION_BITMAP, 0); - vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ - vmwrite(CR3_TARGET_COUNT, 0); - vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) | - VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */ - vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); - vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); - vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) | - VM_ENTRY_IA32E_MODE); /* 64-bit guest */ - vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); - vmwrite(TPR_THRESHOLD, 0); - - vmwrite(CR0_GUEST_HOST_MASK, 0); - vmwrite(CR4_GUEST_HOST_MASK, 0); - vmwrite(CR0_READ_SHADOW, get_cr0()); - vmwrite(CR4_READ_SHADOW, get_cr4()); - - vmwrite(MSR_BITMAP, vmx->msr_gpa); - vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); - vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); -} - -/* - * Initialize the host state fields based on the current host state, with - * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch - * or vmresume. - */ -static inline void init_vmcs_host_state(void) -{ - uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS); - - vmwrite(HOST_ES_SELECTOR, get_es()); - vmwrite(HOST_CS_SELECTOR, get_cs()); - vmwrite(HOST_SS_SELECTOR, get_ss()); - vmwrite(HOST_DS_SELECTOR, get_ds()); - vmwrite(HOST_FS_SELECTOR, get_fs()); - vmwrite(HOST_GS_SELECTOR, get_gs()); - vmwrite(HOST_TR_SELECTOR, get_tr()); - - if (exit_controls & VM_EXIT_LOAD_IA32_PAT) - vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT)); - if (exit_controls & VM_EXIT_LOAD_IA32_EFER) - vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER)); - if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmwrite(HOST_IA32_PERF_GLOBAL_CTRL, - rdmsr(MSR_CORE_PERF_GLOBAL_CTRL)); - - vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS)); - - vmwrite(HOST_CR0, get_cr0()); - vmwrite(HOST_CR3, get_cr3()); - vmwrite(HOST_CR4, get_cr4()); - vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE)); - vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE)); - vmwrite(HOST_TR_BASE, - get_desc64_base((struct desc64 *)(get_gdt().address + get_tr()))); - vmwrite(HOST_GDTR_BASE, get_gdt().address); - vmwrite(HOST_IDTR_BASE, get_idt().address); - vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP)); - vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP)); -} - -/* - * Initialize the guest state fields essentially as a clone of - * the host state fields. Some host state fields have fixed - * values, and we set the corresponding guest state fields accordingly. 
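 * The guest (L2) starts in 64-bit mode at @rip with its stack at @rsp;
 * segment selectors, CR0/CR3/CR4, the FS/GS/TR/GDTR/IDTR bases and the
 * SYSENTER values are cloned from the corresponding host fields, while the
 * flat code and data segment limits are set to their maximums.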
- */ -static inline void init_vmcs_guest_state(void *rip, void *rsp) -{ - vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR)); - vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR)); - vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR)); - vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR)); - vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR)); - vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR)); - vmwrite(GUEST_LDTR_SELECTOR, 0); - vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR)); - vmwrite(GUEST_INTR_STATUS, 0); - vmwrite(GUEST_PML_INDEX, 0); - - vmwrite(VMCS_LINK_POINTER, -1ll); - vmwrite(GUEST_IA32_DEBUGCTL, 0); - vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT)); - vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER)); - vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL, - vmreadz(HOST_IA32_PERF_GLOBAL_CTRL)); - - vmwrite(GUEST_ES_LIMIT, -1); - vmwrite(GUEST_CS_LIMIT, -1); - vmwrite(GUEST_SS_LIMIT, -1); - vmwrite(GUEST_DS_LIMIT, -1); - vmwrite(GUEST_FS_LIMIT, -1); - vmwrite(GUEST_GS_LIMIT, -1); - vmwrite(GUEST_LDTR_LIMIT, -1); - vmwrite(GUEST_TR_LIMIT, 0x67); - vmwrite(GUEST_GDTR_LIMIT, 0xffff); - vmwrite(GUEST_IDTR_LIMIT, 0xffff); - vmwrite(GUEST_ES_AR_BYTES, - vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093); - vmwrite(GUEST_CS_AR_BYTES, 0xa09b); - vmwrite(GUEST_SS_AR_BYTES, 0xc093); - vmwrite(GUEST_DS_AR_BYTES, - vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093); - vmwrite(GUEST_FS_AR_BYTES, - vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093); - vmwrite(GUEST_GS_AR_BYTES, - vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093); - vmwrite(GUEST_LDTR_AR_BYTES, 0x10000); - vmwrite(GUEST_TR_AR_BYTES, 0x8b); - vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); - vmwrite(GUEST_ACTIVITY_STATE, 0); - vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS)); - vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0); - - vmwrite(GUEST_CR0, vmreadz(HOST_CR0)); - vmwrite(GUEST_CR3, vmreadz(HOST_CR3)); - vmwrite(GUEST_CR4, vmreadz(HOST_CR4)); - vmwrite(GUEST_ES_BASE, 0); - vmwrite(GUEST_CS_BASE, 0); - vmwrite(GUEST_SS_BASE, 0); - vmwrite(GUEST_DS_BASE, 0); - vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE)); - vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE)); - vmwrite(GUEST_LDTR_BASE, 0); - vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE)); - vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE)); - vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE)); - vmwrite(GUEST_DR7, 0x400); - vmwrite(GUEST_RSP, (uint64_t)rsp); - vmwrite(GUEST_RIP, (uint64_t)rip); - vmwrite(GUEST_RFLAGS, 2); - vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0); - vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP)); - vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); -} - -void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) -{ - init_vmcs_control_fields(vmx); - init_vmcs_host_state(); - init_vmcs_guest_state(guest_rip, guest_rsp); -} - -static void nested_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) -{ - if (!pte->readable) { - pte->writable = true; - pte->readable = true; - pte->executable = true; - pte->page_size = (current_level == target_level); - if (pte->page_size) - pte->address = paddr >> vm->page_shift; - else - pte->address = vm_alloc_page_table(vm) >> vm->page_shift; - } else { - /* - * Entry already present. Assert that the caller doesn't want - * a hugepage at this level, and that there isn't a hugepage at - * this level. 
- */ - TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - TEST_ASSERT(!pte->page_size, - "Cannot create page table at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - } -} - - -void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) -{ - const uint64_t page_size = PG_LEVEL_SIZE(target_level); - struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; - uint16_t index; - - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx requires 5-level paging", - nested_paddr); - TEST_ASSERT((nested_paddr % page_size) == 0, - "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx page_size: 0x%lx", - nested_paddr, page_size); - TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx page_size: 0x%lx", - paddr, page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { - index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - pte = &pt[index]; - - nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - - if (pte->page_size) - break; - - pt = addr_gpa2hva(vm, pte->address * vm->page_size); - } - - /* - * For now mark these as accessed and dirty because the only - * testcase we have needs that. Can be reconsidered later. - */ - pte->accessed = true; - pte->dirty = true; - -} - -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) -{ - __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); -} - -/* - * Map a range of EPT guest physical addresses to the VM's physical address - * - * Input Args: - * vm - Virtual Machine - * nested_paddr - Nested guest physical address to map - * paddr - VM Physical Address - * size - The size of the range to map - * level - The level at which to map the range - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a nested guest translation for the - * page range starting at nested_paddr to the page range starting at paddr. - */ -void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - int level) -{ - size_t page_size = PG_LEVEL_SIZE(level); - size_t npages = size / page_size; - - TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); - TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); - - while (npages--) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, level); - nested_paddr += page_size; - paddr += page_size; - } -} - -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) -{ - __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); -} - -/* Prepare an identity extended page table that maps all the - * physical pages in VM. 
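 * The loop below walks the memslot's unused_phy_pages sparsebit: a clear bit
 * means the page has already been allocated to the VM, and every such page
 * is identity-mapped (nested_paddr == paddr) with a 4K EPT mapping.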
- */ -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot) -{ - sparsebit_idx_t i, last; - struct userspace_mem_region *region = - memslot2region(vm, memslot); - - i = (region->region.guest_phys_addr >> vm->page_shift) - 1; - last = i + (region->region.memory_size >> vm->page_shift); - for (;;) { - i = sparsebit_next_clear(region->unused_phy_pages, i); - if (i > last) - break; - - nested_map(vmx, vm, - (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, - 1 << vm->page_shift); - } -} - -/* Identity map a region with 1GiB Pages. */ -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size) -{ - __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); -} - -bool kvm_cpu_has_ept(void) -{ - uint64_t ctrl; - - ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; - if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - return false; - - ctrl = kvm_get_feature_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; - return ctrl & SECONDARY_EXEC_ENABLE_EPT; -} - -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) -{ - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - - vmx->eptp = (void *)vm_vaddr_alloc_page(vm); - vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); - vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); -} - -void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) -{ - vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); - vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); - vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); -} diff --git a/tools/testing/selftests/kvm/s390/cmma_test.c b/tools/testing/selftests/kvm/s390/cmma_test.c new file mode 100644 index 000000000000..e32dd59703a0 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/cmma_test.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x CMMA migration + * + * Copyright IBM Corp. 2023 + * + * Authors: + * Nico Boehr + */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" +#include "processor.h" + +#define MAIN_PAGE_COUNT 512 + +#define TEST_DATA_PAGE_COUNT 512 +#define TEST_DATA_MEMSLOT 1 +#define TEST_DATA_START_GFN PAGE_SIZE + +#define TEST_DATA_TWO_PAGE_COUNT 256 +#define TEST_DATA_TWO_MEMSLOT 2 +#define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) + +static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; + +/** + * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, + * so use_cmma goes on and the CMMA related ioctls do something. + */ +static void guest_do_one_essa(void) +{ + asm volatile( + /* load TEST_DATA_START_GFN into r1 */ + " llilf 1,%[start_gfn]\n" + /* calculate the address from the gfn */ + " sllg 1,1,12(0)\n" + /* set the first page in TEST_DATA memslot to STABLE */ + " .insn rrf,0xb9ab0000,2,1,1,0\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN) + : "r1", "r2", "memory", "cc" + ); +} + +/** + * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable + * state. 
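 *
 * The assembly below is roughly equivalent to this sketch, where SET_STABLE
 * stands for the ESSA "set stable" operation and the final diag 0x501 traps
 * back to the host:
 *
 *	for (gfn = TEST_DATA_START_GFN;
 *	     gfn < TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT; gfn++)
 *		essa(SET_STABLE, gfn << 12);
 *	diag_0x501();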
+ */ +static void guest_dirty_test_data(void) +{ + asm volatile( + /* r1 = TEST_DATA_START_GFN */ + " xgr 1,1\n" + " llilf 1,%[start_gfn]\n" + /* r5 = TEST_DATA_PAGE_COUNT */ + " lghi 5,%[page_count]\n" + /* r5 += r1 */ + "2: agfr 5,1\n" + /* r2 = r1 << PAGE_SHIFT */ + "1: sllg 2,1,12(0)\n" + /* essa(r4, r2, SET_STABLE) */ + " .insn rrf,0xb9ab0000,4,2,1,0\n" + /* i++ */ + " agfi 1,1\n" + /* if r1 < r5 goto 1 */ + " cgrjl 1,5,1b\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN), + [page_count] "L"(TEST_DATA_PAGE_COUNT) + : + /* the counter in our loop over the pages */ + "r1", + /* the calculated page physical address */ + "r2", + /* ESSA output register */ + "r4", + /* last page */ + "r5", + "cc", "memory" + ); +} + +static void create_main_memslot(struct kvm_vm *vm) +{ + int i; + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0); + /* set the array of memslots to zero like __vm_create does */ + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; +} + +static void create_test_memslot(struct kvm_vm *vm) +{ + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_START_GFN << vm->page_shift, + TEST_DATA_MEMSLOT, + TEST_DATA_PAGE_COUNT, + 0 + ); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void create_memslots(struct kvm_vm *vm) +{ + /* + * Our VM has the following memory layout: + * +------+---------------------------+ + * | GFN | Memslot | + * +------+---------------------------+ + * | 0 | | + * | ... | MAIN (Code, Stack, ...) | + * | 511 | | + * +------+---------------------------+ + * | 4096 | | + * | ... | TEST_DATA | + * | 4607 | | + * +------+---------------------------+ + */ + create_main_memslot(vm); + create_test_memslot(vm); +} + +static void finish_vm_setup(struct kvm_vm *vm) +{ + struct userspace_mem_region *slot0; + + kvm_vm_elf_load(vm, program_invocation_name); + + slot0 = memslot2region(vm, 0); + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + + kvm_arch_vm_post_create(vm); +} + +static struct kvm_vm *create_vm_two_memslots(void) +{ + struct kvm_vm *vm; + + vm = vm_create_barebones(); + + create_memslots(vm); + + finish_vm_setup(vm); + + return vm; +} + +static void enable_cmma(struct kvm_vm *vm) +{ + int r; + + r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL); + TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno); +} + +static void enable_dirty_tracking(struct kvm_vm *vm) +{ + vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES); + vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); +} + +static int __enable_migration_mode(struct kvm_vm *vm) +{ + return __kvm_device_attr_set(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_START, + NULL + ); +} + +static void enable_migration_mode(struct kvm_vm *vm) +{ + int r = __enable_migration_mode(vm); + + TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno); +} + +static bool is_migration_mode_on(struct kvm_vm *vm) +{ + u64 out; + int r; + + r = __kvm_device_attr_get(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_STATUS, + &out + ); + TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno); + return out; +} + +static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out) +{ + struct kvm_s390_cmma_log args; + int rc; + + errno = 0; + + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags 
= flags, + .values = (__u64)&cmma_value_buf[0] + }; + rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + + *errno_out = errno; + return rc; +} + +static void test_get_cmma_basic(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + int rc, errno_out; + + /* GET_CMMA_BITS without CMMA enabled should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, ENXIO); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + vcpu_run(vcpu); + + /* GET_CMMA_BITS without migration mode and without peeking should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, EINVAL); + + /* GET_CMMA_BITS without migration mode and with peeking should work */ + rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT_EQ(errno_out, 0); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* GET_CMMA_BITS with invalid flags */ + rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, EINVAL); + + kvm_vm_free(vm); +} + +static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu) +{ + TEST_ASSERT_EQ(vcpu->run->exit_reason, 13); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000); +} + +static void test_migration_mode(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + struct kvm_vcpu *vcpu; + u64 orig_psw; + int rc; + + /* enabling migration mode on a VM without memory should fail */ + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + create_memslots(vm); + finish_vm_setup(vm); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + orig_psw = vcpu->run->psw_addr; + + /* + * Execute one essa instruction in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* migration mode when memslots have dirty tracking off should fail */ + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + /* enable dirty tracking */ + enable_dirty_tracking(vm); + + /* enabling migration mode should work now */ + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* execute another ESSA instruction to see this goes fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * With migration mode on, create a new memslot with dirty tracking off. + * This should turn off migration mode. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_TWO_START_GFN << vm->page_shift, + TEST_DATA_TWO_MEMSLOT, + TEST_DATA_TWO_PAGE_COUNT, + 0 + ); + TEST_ASSERT(!is_migration_mode_on(vm), + "creating memslot without dirty tracking turns off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * Turn on dirty tracking on the new memslot. 
+ * It should be possible to turn migration mode back on again. + */ + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* + * Turn off dirty tracking again, this time with just a flag change. + * Again, migration mode should turn off. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0); + TEST_ASSERT(!is_migration_mode_on(vm), + "disabling dirty tracking should turn off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + kvm_vm_free(vm); +} + +/** + * Given a VM with the MAIN and TEST_DATA memslot, assert that both slots have + * CMMA attributes of all pages in both memslots and nothing more dirty. + * This has the useful side effect of ensuring nothing is CMMA dirty after this + * function. + */ +static void assert_all_slots_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* + * First iteration - everything should be dirty. + * Start at the main memslot... + */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, MAIN_PAGE_COUNT); + TEST_ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT); + TEST_ASSERT_EQ(args.start_gfn, 0); + + /* ...and then - after a hole - the TEST_DATA memslot should follow */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = MAIN_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT); + TEST_ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN); + TEST_ASSERT_EQ(args.remaining, 0); + + /* ...and nothing else should be there */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, 0); + TEST_ASSERT_EQ(args.start_gfn, 0); + TEST_ASSERT_EQ(args.remaining, 0); +} + +/** + * Given a VM, assert no pages are CMMA dirty. + */ +static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* If we start from GFN 0 again, nothing should be dirty. */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + if (args.count || args.remaining || args.start_gfn) + TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu", + args.start_gfn, + args.count, + args.remaining + ); +} + +static void test_get_inital_dirty(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + /* + * Execute one essa instruction in the guest. 
Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + assert_all_slots_cmma_dirty(vm); + + /* Start from the beginning again and make sure nothing else is dirty */ + assert_no_pages_cmma_dirty(vm); + + kvm_vm_free(vm); +} + +static void query_cmma_range(struct kvm_vm *vm, + u64 start_gfn, u64 gfn_count, + struct kvm_s390_cmma_log *res_out) +{ + *res_out = (struct kvm_s390_cmma_log){ + .start_gfn = start_gfn, + .count = gfn_count, + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out); +} + +/** + * Assert the given cmma_log struct that was executed by query_cmma_range() + * indicates the first dirty gfn is at first_dirty_gfn and contains exactly + * dirty_gfn_count CMMA values. + */ +static void assert_cmma_dirty(u64 first_dirty_gfn, + u64 dirty_gfn_count, + const struct kvm_s390_cmma_log *res) +{ + TEST_ASSERT_EQ(res->start_gfn, first_dirty_gfn); + TEST_ASSERT_EQ(res->count, dirty_gfn_count); + for (size_t i = 0; i < dirty_gfn_count; i++) + TEST_ASSERT_EQ(cmma_value_buf[0], 0x0); /* stable state */ + TEST_ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */ +} + +static void test_get_skip_holes(void) +{ + size_t gfn_offset; + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_s390_cmma_log log; + struct kvm_vcpu *vcpu; + u64 orig_psw; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data); + + orig_psw = vcpu->run->psw_addr; + + /* + * Execute some essa instructions in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* un-dirty all pages */ + assert_all_slots_cmma_dirty(vm); + + /* Then, dirty just the TEST_DATA memslot */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + + gfn_offset = TEST_DATA_START_GFN; + /** + * Query CMMA attributes of one page, starting at page 0. Since the + * main memslot was not touched by the VM, this should yield the first + * page of the TEST_DATA memslot. + * The dirty bitmap should now look like this: + * 0: not dirty + * [0x1, 0x200): dirty + */ + query_cmma_range(vm, 0, 1, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + gfn_offset++; + + /** + * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA + * memslot. This should wrap back to the beginning of the TEST_DATA + * memslot, page 1. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /* Skip 32 pages */ + gfn_offset += 0x20; + + /** + * After skipping 32 pages, query the next 32 (0x20) pages. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, gfn_offset, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /** + * Query 1 page from the beginning of the TEST_DATA memslot. This should + * yield page 0x21. 
+ * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * [0x22, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log); + gfn_offset++; + + /** + * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot. + * This should yield pages [0x23, 0x33). + * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * 0x22: dirty + * [0x23, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x23; + query_cmma_range(vm, gfn_offset, 15, &log); + assert_cmma_dirty(gfn_offset, 15, &log); + + /** + * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot. + * This should yield page [0x22, 0x33) + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x22; + query_cmma_range(vm, gfn_offset, 17, &log); + assert_cmma_dirty(gfn_offset, 17, &log); + + /** + * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot. + * This should yield page 0x40 and nothing more, since there are more + * than 16 non-dirty pages after page 0x40. + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x40): dirty + * [0x40, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x40; + query_cmma_range(vm, gfn_offset, 25, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + + /** + * Query pages [0x33, 0x40). + * The dirty bitmap should now look like this: + * [0, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x33; + query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log); + assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log); + + /** + * Query the remaining pages [0x61, 0x200). + */ + gfn_offset = TEST_DATA_START_GFN; + query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log); + + assert_no_pages_cmma_dirty(vm); +} + +struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "migration mode and dirty tracking", test_migration_mode }, + { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, + { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, + { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, +}; + +/** + * The kernel may support CMMA, but the machine may not (i.e. if running as + * guest-3). + * + * In this case, the CMMA capabilities are all there, but the CMMA-related + * ioctls fail. To find out whether the machine supports CMMA, create a + * temporary VM and then query the CMMA feature of the VM. 
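 * The probe below does exactly that: it checks whether the
 * KVM_S390_VM_MEM_ENABLE_CMMA attribute exists on a bare-bones VM, with
 * __kvm_has_device_attr() returning 0 when the attribute is available.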
+ */ +static int machine_has_cmma(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + int r; + + r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA); + kvm_vm_free(vm); + + return r; +} + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION)); + TEST_REQUIRE(machine_has_cmma()); + + ksft_print_header(); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390/config b/tools/testing/selftests/kvm/s390/config new file mode 100644 index 000000000000..23270f2d679f --- /dev/null +++ b/tools/testing/selftests/kvm/s390/config @@ -0,0 +1,2 @@ +CONFIG_KVM=y +CONFIG_KVM_S390_UCONTROL=y diff --git a/tools/testing/selftests/kvm/s390/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390/cpumodel_subfuncs_test.c new file mode 100644 index 000000000000..27255880dabd --- /dev/null +++ b/tools/testing/selftests/kvm/s390/cpumodel_subfuncs_test.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright IBM Corp. 2024 + * + * Authors: + * Hariharan Mari + * + * The tests compare the result of the KVM ioctl for obtaining CPU subfunction data with those + * from an ASM block performing the same CPU subfunction. Currently KVM doesn't mask instruction + * query data reported via the CPU Model, allowing us to directly compare it with the data + * acquired through executing the queries in the test. + */ + +#include +#include +#include +#include +#include "facility.h" + +#include "kvm_util.h" + +#define PLO_FUNCTION_MAX 256 + +/* Query available CPU subfunctions */ +struct kvm_s390_vm_cpu_subfunc cpu_subfunc; + +static void get_cpu_machine_subfuntions(struct kvm_vm *vm, + struct kvm_s390_vm_cpu_subfunc *cpu_subfunc) +{ + int r; + + r = __kvm_device_attr_get(vm->fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_MACHINE_SUBFUNC, cpu_subfunc); + + TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno); +} + +static inline int plo_test_bit(unsigned char nr) +{ + unsigned long function = nr | 0x100; + int cc; + + asm volatile(" lgr 0,%[function]\n" + /* Parameter registers are ignored for "test bit" */ + " plo 0,0,0,0(0)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : [function] "d" (function) + : "cc", "0"); + return cc == 0; +} + +/* Testing Perform Locked Operation (PLO) CPU subfunction's ASM block */ +static void test_plo_asm_block(u8 (*query)[32]) +{ + for (int i = 0; i < PLO_FUNCTION_MAX; ++i) { + if (plo_test_bit(i)) + (*query)[i >> 3] |= 0x80 >> (i & 7); + } +} + +/* Testing Crypto Compute Message Authentication Code (KMAC) CPU subfunction's ASM block */ +static void test_kmac_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb91e0000,0,2\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message with Chaining (KMC) CPU subfunction's ASM block */ +static void test_kmc_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb92f0000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message (KM) CPU subfunction's ASM block */ +static void test_km_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + 
" .insn rre,0xb92e0000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Compute Intermediate Message Digest (KIMD) CPU subfunction's ASM block */ +static void test_kimd_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb93e0000,0,2\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Compute Last Message Digest (KLMD) CPU subfunction's ASM block */ +static void test_klmd_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb93f0000,0,2\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message with Counter (KMCTR) CPU subfunction's ASM block */ +static void test_kmctr_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rrf,0xb92d0000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message with Cipher Feedback (KMF) CPU subfunction's ASM block */ +static void test_kmf_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb92a0000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message with Output Feedback (KMO) CPU subfunction's ASM block */ +static void test_kmo_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb92b0000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Perform Cryptographic Computation (PCC) CPU subfunction's ASM block */ +static void test_pcc_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb92c0000,0,0\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Perform Random Number Operation (PRNO) CPU subfunction's ASM block */ +static void test_prno_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb93c0000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Cipher Message with Authentication (KMA) CPU subfunction's ASM block */ +static void test_kma_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rrf,0xb9290000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Crypto Compute Digital Signature Authentication (KDSA) CPU subfunction's ASM block */ +static void test_kdsa_asm_block(u8 (*query)[16]) +{ + asm volatile(" la %%r1,%[query]\n" + " xgr %%r0,%%r0\n" + " .insn rre,0xb93a0000,0,2\n" + : [query] "=R" (*query) + : + : "cc", "r0", "r1"); +} + +/* Testing Sort Lists (SORTL) CPU subfunction's ASM block */ +static void test_sortl_asm_block(u8 (*query)[32]) +{ + asm volatile(" lghi 0,0\n" + " la 1,%[query]\n" + " .insn rre,0xb9380000,2,4\n" + : [query] "=R" (*query) + : + : "cc", "0", "1"); +} + +/* Testing Deflate Conversion Call (DFLTCC) CPU subfunction's ASM block */ +static void test_dfltcc_asm_block(u8 (*query)[32]) +{ + asm volatile(" lghi 0,0\n" + " la 1,%[query]\n" + " .insn rrf,0xb9390000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "0", "1"); +} + +/* + * Testing Perform Function with Concurrent Results (PFCR) + * CPU subfunctions's ASM block + */ +static void test_pfcr_asm_block(u8 (*query)[16]) +{ + asm volatile(" lghi 0,0\n" + " .insn rsy,0xeb0000000016,0,0,%[query]\n" + : [query] "=QS" (*query) + : + : "cc", "0"); +} + +typedef void 
(*testfunc_t)(u8 (*array)[]); + +struct testdef { + const char *subfunc_name; + u8 *subfunc_array; + size_t array_size; + testfunc_t test; + int facility_bit; +} testlist[] = { + /* + * PLO was introduced in the very first 64-bit machine generation. + * Hence it is assumed PLO is always installed in Z Arch. + */ + { "PLO", cpu_subfunc.plo, sizeof(cpu_subfunc.plo), test_plo_asm_block, 1 }, + /* MSA - Facility bit 17 */ + { "KMAC", cpu_subfunc.kmac, sizeof(cpu_subfunc.kmac), test_kmac_asm_block, 17 }, + { "KMC", cpu_subfunc.kmc, sizeof(cpu_subfunc.kmc), test_kmc_asm_block, 17 }, + { "KM", cpu_subfunc.km, sizeof(cpu_subfunc.km), test_km_asm_block, 17 }, + { "KIMD", cpu_subfunc.kimd, sizeof(cpu_subfunc.kimd), test_kimd_asm_block, 17 }, + { "KLMD", cpu_subfunc.klmd, sizeof(cpu_subfunc.klmd), test_klmd_asm_block, 17 }, + /* MSA - Facility bit 77 */ + { "KMCTR", cpu_subfunc.kmctr, sizeof(cpu_subfunc.kmctr), test_kmctr_asm_block, 77 }, + { "KMF", cpu_subfunc.kmf, sizeof(cpu_subfunc.kmf), test_kmf_asm_block, 77 }, + { "KMO", cpu_subfunc.kmo, sizeof(cpu_subfunc.kmo), test_kmo_asm_block, 77 }, + { "PCC", cpu_subfunc.pcc, sizeof(cpu_subfunc.pcc), test_pcc_asm_block, 77 }, + /* MSA5 - Facility bit 57 */ + { "PPNO", cpu_subfunc.ppno, sizeof(cpu_subfunc.ppno), test_prno_asm_block, 57 }, + /* MSA8 - Facility bit 146 */ + { "KMA", cpu_subfunc.kma, sizeof(cpu_subfunc.kma), test_kma_asm_block, 146 }, + /* MSA9 - Facility bit 155 */ + { "KDSA", cpu_subfunc.kdsa, sizeof(cpu_subfunc.kdsa), test_kdsa_asm_block, 155 }, + /* SORTL - Facility bit 150 */ + { "SORTL", cpu_subfunc.sortl, sizeof(cpu_subfunc.sortl), test_sortl_asm_block, 150 }, + /* DFLTCC - Facility bit 151 */ + { "DFLTCC", cpu_subfunc.dfltcc, sizeof(cpu_subfunc.dfltcc), test_dfltcc_asm_block, 151 }, + /* Concurrent-function facility - Facility bit 201 */ + { "PFCR", cpu_subfunc.pfcr, sizeof(cpu_subfunc.pfcr), test_pfcr_asm_block, 201 }, +}; + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + int idx; + + ksft_print_header(); + + vm = vm_create(1); + + memset(&cpu_subfunc, 0, sizeof(cpu_subfunc)); + get_cpu_machine_subfuntions(vm, &cpu_subfunc); + + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (test_facility(testlist[idx].facility_bit)) { + u8 *array = malloc(testlist[idx].array_size); + + testlist[idx].test((u8 (*)[testlist[idx].array_size])array); + + TEST_ASSERT_EQ(memcmp(testlist[idx].subfunc_array, + array, testlist[idx].array_size), 0); + + ksft_test_result_pass("%s\n", testlist[idx].subfunc_name); + free(array); + } else { + ksft_test_result_skip("%s feature is not avaialable\n", + testlist[idx].subfunc_name); + } + } + + kvm_vm_free(vm); + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/s390/debug_test.c b/tools/testing/selftests/kvm/s390/debug_test.c new file mode 100644 index 000000000000..ad8095968601 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/debug_test.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test KVM debugging features. */ +#include "kvm_util.h" +#include "test_util.h" +#include "sie.h" + +#include + +#define __LC_SVC_NEW_PSW 0x1c0 +#define __LC_PGM_NEW_PSW 0x1d0 +#define IPA0_DIAG 0x8300 +#define PGM_SPECIFICATION 0x06 + +/* Common code for testing single-stepping interruptions. 
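 *
 * Rough idea (see test_step_int_1() below): install int_handler as the
 * respective interruption new PSW in the lowcore, enable
 * KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP and run the guest for one
 * instruction. The expected result is a KVM_EXIT_DEBUG exit with the PSW
 * already switched to the new PSW, i.e. pointing at int_handler.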
*/ +extern char int_handler[]; +asm("int_handler:\n" + "j .\n"); + +static struct kvm_vm *test_step_int_1(struct kvm_vcpu **vcpu, void *guest_code, + size_t new_psw_off, uint64_t *new_psw) +{ + struct kvm_guest_debug debug = {}; + struct kvm_regs regs; + struct kvm_vm *vm; + char *lowcore; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + lowcore = addr_gpa2hva(vm, 0); + new_psw[0] = (*vcpu)->run->psw_mask; + new_psw[1] = (uint64_t)int_handler; + memcpy(lowcore + new_psw_off, new_psw, 16); + vcpu_regs_get(*vcpu, ®s); + regs.gprs[2] = -1; + vcpu_regs_set(*vcpu, ®s); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + vcpu_guest_debug_set(*vcpu, &debug); + vcpu_run(*vcpu); + + return vm; +} + +static void test_step_int(void *guest_code, size_t new_psw_off) +{ + struct kvm_vcpu *vcpu; + uint64_t new_psw[2]; + struct kvm_vm *vm; + + vm = test_step_int_1(&vcpu, guest_code, new_psw_off, new_psw); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); + TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); + kvm_vm_free(vm); +} + +/* Test single-stepping "boring" program interruptions. */ +extern char test_step_pgm_guest_code[]; +asm("test_step_pgm_guest_code:\n" + ".insn rr,0x1d00,%r1,%r0 /* dr %r1,%r0 */\n" + "j .\n"); + +static void test_step_pgm(void) +{ + test_step_int(test_step_pgm_guest_code, __LC_PGM_NEW_PSW); +} + +/* + * Test single-stepping program interruptions caused by DIAG. + * Userspace emulation must not interfere with single-stepping. + */ +extern char test_step_pgm_diag_guest_code[]; +asm("test_step_pgm_diag_guest_code:\n" + "diag %r0,%r0,0\n" + "j .\n"); + +static void test_step_pgm_diag(void) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = PGM_SPECIFICATION, + }; + struct kvm_vcpu *vcpu; + uint64_t new_psw[2]; + struct kvm_vm *vm; + + vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, + __LC_PGM_NEW_PSW, new_psw); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); + vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); + TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); + kvm_vm_free(vm); +} + +/* + * Test single-stepping program interruptions caused by ISKE. + * CPUSTAT_KSS handling must not interfere with single-stepping. + */ +extern char test_step_pgm_iske_guest_code[]; +asm("test_step_pgm_iske_guest_code:\n" + "iske %r2,%r2\n" + "j .\n"); + +static void test_step_pgm_iske(void) +{ + test_step_int(test_step_pgm_iske_guest_code, __LC_PGM_NEW_PSW); +} + +/* + * Test single-stepping program interruptions caused by LCTL. + * KVM emulation must not interfere with single-stepping. + */ +extern char test_step_pgm_lctl_guest_code[]; +asm("test_step_pgm_lctl_guest_code:\n" + "lctl %c0,%c0,1\n" + "j .\n"); + +static void test_step_pgm_lctl(void) +{ + test_step_int(test_step_pgm_lctl_guest_code, __LC_PGM_NEW_PSW); +} + +/* Test single-stepping supervisor-call interruptions. */ +extern char test_step_svc_guest_code[]; +asm("test_step_svc_guest_code:\n" + "svc 0\n" + "j .\n"); + +static void test_step_svc(void) +{ + test_step_int(test_step_svc_guest_code, __LC_SVC_NEW_PSW); +} + +/* Run all tests above. 
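 *
 * Each entry is reported as one TAP result through the kselftest helpers:
 * ksft_set_plan(ARRAY_SIZE(testlist)) announces the plan and
 * ksft_test_result_pass() reports each test, so adding a test only means
 * adding a line to testlist[].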
*/ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "single-step pgm", test_step_pgm }, + { "single-step pgm caused by diag", test_step_pgm_diag }, + { "single-step pgm caused by iske", test_step_pgm_iske }, + { "single-step pgm caused by lctl", test_step_pgm_lctl }, + { "single-step svc", test_step_svc }, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/s390/memop.c b/tools/testing/selftests/kvm/s390/memop.c new file mode 100644 index 000000000000..4374b4cd2a80 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/memop.c @@ -0,0 +1,1187 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test for s390x KVM_S390_MEM_OP + * + * Copyright (C) 2019, Red Hat, Inc. + */ +#include +#include +#include +#include +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" +#include "processor.h" + +enum mop_target { + LOGICAL, + SIDA, + ABSOLUTE, + INVALID, +}; + +enum mop_access_mode { + READ, + WRITE, + CMPXCHG, +}; + +struct mop_desc { + uintptr_t gaddr; + uintptr_t gaddr_v; + uint64_t set_flags; + unsigned int f_check : 1; + unsigned int f_inject : 1; + unsigned int f_key : 1; + unsigned int _gaddr_v : 1; + unsigned int _set_flags : 1; + unsigned int _sida_offset : 1; + unsigned int _ar : 1; + uint32_t size; + enum mop_target target; + enum mop_access_mode mode; + void *buf; + uint32_t sida_offset; + void *old; + uint8_t old_value[16]; + bool *cmpxchg_success; + uint8_t ar; + uint8_t key; +}; + +const uint8_t NO_KEY = 0xff; + +static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc *desc) +{ + struct kvm_s390_mem_op ksmo = { + .gaddr = (uintptr_t)desc->gaddr, + .size = desc->size, + .buf = ((uintptr_t)desc->buf), + .reserved = "ignored_ignored_ignored_ignored" + }; + + switch (desc->target) { + case LOGICAL: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + break; + case SIDA: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_SIDA_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_SIDA_WRITE; + break; + case ABSOLUTE: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_WRITE; + if (desc->mode == CMPXCHG) { + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_CMPXCHG; + ksmo.old_addr = (uint64_t)desc->old; + memcpy(desc->old_value, desc->old, desc->size); + } + break; + case INVALID: + ksmo.op = -1; + } + if (desc->f_check) + ksmo.flags |= KVM_S390_MEMOP_F_CHECK_ONLY; + if (desc->f_inject) + ksmo.flags |= KVM_S390_MEMOP_F_INJECT_EXCEPTION; + if (desc->_set_flags) + ksmo.flags = desc->set_flags; + if (desc->f_key && desc->key != NO_KEY) { + ksmo.flags |= KVM_S390_MEMOP_F_SKEY_PROTECTION; + ksmo.key = desc->key; + } + if (desc->_ar) + ksmo.ar = desc->ar; + else + ksmo.ar = 0; + if (desc->_sida_offset) + ksmo.sida_offset = desc->sida_offset; + + return ksmo; +} + +struct test_info { + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; +}; + +#define PRINT_MEMOP false +static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksmo) +{ + if (!PRINT_MEMOP) + return; + + if (!vcpu) + printf("vm memop("); + else + printf("vcpu memop("); + switch (ksmo->op) { + 
case KVM_S390_MEMOP_LOGICAL_READ: + printf("LOGICAL, READ, "); + break; + case KVM_S390_MEMOP_LOGICAL_WRITE: + printf("LOGICAL, WRITE, "); + break; + case KVM_S390_MEMOP_SIDA_READ: + printf("SIDA, READ, "); + break; + case KVM_S390_MEMOP_SIDA_WRITE: + printf("SIDA, WRITE, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_READ: + printf("ABSOLUTE, READ, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + printf("ABSOLUTE, WRITE, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + printf("ABSOLUTE, CMPXCHG, "); + break; + } + printf("gaddr=%llu, size=%u, buf=%llu, ar=%u, key=%u, old_addr=%llx", + ksmo->gaddr, ksmo->size, ksmo->buf, ksmo->ar, ksmo->key, + ksmo->old_addr); + if (ksmo->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + printf(", CHECK_ONLY"); + if (ksmo->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) + printf(", INJECT_EXCEPTION"); + if (ksmo->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) + printf(", SKEY_PROTECTION"); + puts(")"); +} + +static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, + struct mop_desc *desc) +{ + struct kvm_vcpu *vcpu = info.vcpu; + + if (!vcpu) + return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); + else + return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo); +} + +static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, + struct mop_desc *desc) +{ + int r; + + r = err_memop_ioctl(info, ksmo, desc); + if (ksmo->op == KVM_S390_MEMOP_ABSOLUTE_CMPXCHG) { + if (desc->cmpxchg_success) { + int diff = memcmp(desc->old_value, desc->old, desc->size); + *desc->cmpxchg_success = !diff; + } + } + TEST_ASSERT(!r, __KVM_IOCTL_ERROR("KVM_S390_MEM_OP", r)); +} + +#define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...) \ +({ \ + struct test_info __info = (info_p); \ + struct mop_desc __desc = { \ + .target = (mop_target_p), \ + .mode = (access_mode_p), \ + .buf = (buf_p), \ + .size = (size_p), \ + __VA_ARGS__ \ + }; \ + struct kvm_s390_mem_op __ksmo; \ + \ + if (__desc._gaddr_v) { \ + if (__desc.target == ABSOLUTE) \ + __desc.gaddr = addr_gva2gpa(__info.vm, __desc.gaddr_v); \ + else \ + __desc.gaddr = __desc.gaddr_v; \ + } \ + __ksmo = ksmo_from_desc(&__desc); \ + print_memop(__info.vcpu, &__ksmo); \ + err##memop_ioctl(__info, &__ksmo, &__desc); \ +}) + +#define MOP(...) MEMOP(, __VA_ARGS__) +#define ERR_MOP(...) MEMOP(err_, __VA_ARGS__) + +#define GADDR(a) .gaddr = ((uintptr_t)a) +#define GADDR_V(v) ._gaddr_v = 1, .gaddr_v = ((uintptr_t)v) +#define CHECK_ONLY .f_check = 1 +#define SET_FLAGS(f) ._set_flags = 1, .set_flags = (f) +#define SIDA_OFFSET(o) ._sida_offset = 1, .sida_offset = (o) +#define AR(a) ._ar = 1, .ar = (a) +#define KEY(a) .f_key = 1, .key = (a) +#define INJECT .f_inject = 1 +#define CMPXCHG_OLD(o) .old = (o) +#define CMPXCHG_SUCCESS(s) .cmpxchg_success = (s) + +#define CHECK_N_DO(f, ...) 
({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) + +#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) +#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) + +static uint8_t __aligned(PAGE_SIZE) mem1[65536]; +static uint8_t __aligned(PAGE_SIZE) mem2[65536]; + +struct test_default { + struct kvm_vm *kvm_vm; + struct test_info vm; + struct test_info vcpu; + struct kvm_run *run; + int size; +}; + +static struct test_default test_default_init(void *guest_code) +{ + struct kvm_vcpu *vcpu; + struct test_default t; + + t.size = min((size_t)kvm_check_cap(KVM_CAP_S390_MEM_OP), sizeof(mem1)); + t.kvm_vm = vm_create_with_one_vcpu(&vcpu, guest_code); + t.vm = (struct test_info) { t.kvm_vm, NULL }; + t.vcpu = (struct test_info) { t.kvm_vm, vcpu }; + t.run = vcpu->run; + return t; +} + +enum stage { + /* Synced state set by host, e.g. DAT */ + STAGE_INITED, + /* Guest did nothing */ + STAGE_IDLED, + /* Guest set storage keys (specifics up to test case) */ + STAGE_SKEYS_SET, + /* Guest copied memory (locations up to test case) */ + STAGE_COPIED, + /* End of guest code reached */ + STAGE_DONE, +}; + +#define HOST_SYNC(info_p, stage) \ +({ \ + struct test_info __info = (info_p); \ + struct kvm_vcpu *__vcpu = __info.vcpu; \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ + if (uc.cmd == UCALL_ABORT) { \ + REPORT_GUEST_ASSERT(uc); \ + } \ + TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + TEST_ASSERT_EQ(uc.args[1], __stage); \ +}) \ + +static void prepare_mem12(void) +{ + int i; + + for (i = 0; i < sizeof(mem1); i++) + mem1[i] = rand(); + memset(mem2, 0xaa, sizeof(mem2)); +} + +#define ASSERT_MEM_EQ(p1, p2, size) \ + TEST_ASSERT(!memcmp(p1, p2, size), "Memory contents do not match!") + +static void default_write_read(struct test_info copy_cpu, struct test_info mop_cpu, + enum mop_target mop_target, uint32_t size, uint8_t key) +{ + prepare_mem12(); + CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, + GADDR_V(mem1), KEY(key)); + HOST_SYNC(copy_cpu, STAGE_COPIED); + CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, + GADDR_V(mem2), KEY(key)); + ASSERT_MEM_EQ(mem1, mem2, size); +} + +static void default_read(struct test_info copy_cpu, struct test_info mop_cpu, + enum mop_target mop_target, uint32_t size, uint8_t key) +{ + prepare_mem12(); + CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, GADDR_V(mem1)); + HOST_SYNC(copy_cpu, STAGE_COPIED); + CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, + GADDR_V(mem2), KEY(key)); + ASSERT_MEM_EQ(mem1, mem2, size); +} + +static void default_cmpxchg(struct test_default *test, uint8_t key) +{ + for (int size = 1; size <= 16; size *= 2) { + for (int offset = 0; offset < 16; offset += size) { + uint8_t __aligned(16) new[16] = {}; + uint8_t __aligned(16) old[16]; + bool succ; + + prepare_mem12(); + default_write_read(test->vcpu, test->vcpu, LOGICAL, 16, NO_KEY); + + memcpy(&old, mem1, 16); + MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, + size, GADDR_V(mem1 + offset), + CMPXCHG_OLD(old + offset), + CMPXCHG_SUCCESS(&succ), KEY(key)); + HOST_SYNC(test->vcpu, STAGE_COPIED); + MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); + TEST_ASSERT(succ, "exchange of values should succeed"); + memcpy(mem1 + offset, new + offset, size); + ASSERT_MEM_EQ(mem1, mem2, 16); + + memcpy(&old, mem1, 16); + new[offset]++; + old[offset]++; + MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, + size, GADDR_V(mem1 + offset), + CMPXCHG_OLD(old + offset), + CMPXCHG_SUCCESS(&succ), KEY(key)); + HOST_SYNC(test->vcpu, 
STAGE_COPIED); + MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); + TEST_ASSERT(!succ, "exchange of values should not succeed"); + ASSERT_MEM_EQ(mem1, mem2, 16); + ASSERT_MEM_EQ(&old, mem1, 16); + } + } +} + +static void guest_copy(void) +{ + GUEST_SYNC(STAGE_INITED); + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); +} + +static void test_copy(void) +{ + struct test_default t = test_default_init(guest_copy); + + HOST_SYNC(t.vcpu, STAGE_INITED); + + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, NO_KEY); + + kvm_vm_free(t.kvm_vm); +} + +static void test_copy_access_register(void) +{ + struct test_default t = test_default_init(guest_copy); + + HOST_SYNC(t.vcpu, STAGE_INITED); + + prepare_mem12(); + t.run->psw_mask &= ~(3UL << (63 - 17)); + t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ + + /* + * Primary address space gets used if an access register + * contains zero. The host makes use of AR[1] so is a good + * candidate to ensure the guest AR (of zero) is used. + */ + CHECK_N_DO(MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, + GADDR_V(mem1), AR(1)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, t.size, + GADDR_V(mem2), AR(1)); + ASSERT_MEM_EQ(mem1, mem2, t.size); + + kvm_vm_free(t.kvm_vm); +} + +static void set_storage_key_range(void *addr, size_t len, uint8_t key) +{ + uintptr_t _addr, abs, i; + int not_mapped = 0; + + _addr = (uintptr_t)addr; + for (i = _addr & PAGE_MASK; i < _addr + len; i += PAGE_SIZE) { + abs = i; + asm volatile ( + "lra %[abs], 0(0,%[abs])\n" + " jz 0f\n" + " llill %[not_mapped],1\n" + " j 1f\n" + "0: sske %[key], %[abs]\n" + "1:" + : [abs] "+&a" (abs), [not_mapped] "+r" (not_mapped) + : [key] "r" (key) + : "cc" + ); + GUEST_ASSERT_EQ(not_mapped, 0); + } +} + +static void guest_copy_key(void) +{ + set_storage_key_range(mem1, sizeof(mem1), 0x90); + set_storage_key_range(mem2, sizeof(mem2), 0x90); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (;;) { + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); + } +} + +static void test_copy_key(void) +{ + struct test_default t = test_default_init(guest_copy_key); + + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm, no key */ + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, NO_KEY); + + /* vm/vcpu, machting key or key 0 */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 0); + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 0); + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); + /* + * There used to be different code paths for key handling depending on + * if the region crossed a page boundary. + * There currently are not, but the more tests the merrier. 
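+	 * The t.size transfers above typically span several pages of the 64 KiB
+	 * mem1/mem2 buffers, while the single-byte transfers below stay within
+	 * one page, so both the crossing and the non-crossing case get exercised.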
+ */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 0); + default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 9); + default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 0); + default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 9); + + /* vm/vcpu, mismatching keys on read, but no fetch protection */ + default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); + default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 2); + + kvm_vm_free(t.kvm_vm); +} + +static void test_cmpxchg_key(void) +{ + struct test_default t = test_default_init(guest_copy_key); + + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + default_cmpxchg(&t, NO_KEY); + default_cmpxchg(&t, 0); + default_cmpxchg(&t, 9); + + kvm_vm_free(t.kvm_vm); +} + +static __uint128_t cut_to_size(int size, __uint128_t val) +{ + switch (size) { + case 1: + return (uint8_t)val; + case 2: + return (uint16_t)val; + case 4: + return (uint32_t)val; + case 8: + return (uint64_t)val; + case 16: + return val; + } + GUEST_FAIL("Invalid size = %u", size); + return 0; +} + +static bool popcount_eq(__uint128_t a, __uint128_t b) +{ + unsigned int count_a, count_b; + + count_a = __builtin_popcountl((uint64_t)(a >> 64)) + + __builtin_popcountl((uint64_t)a); + count_b = __builtin_popcountl((uint64_t)(b >> 64)) + + __builtin_popcountl((uint64_t)b); + return count_a == count_b; +} + +static __uint128_t rotate(int size, __uint128_t val, int amount) +{ + unsigned int bits = size * 8; + + amount = (amount + bits) % bits; + val = cut_to_size(size, val); + if (!amount) + return val; + return (val << (bits - amount)) | (val >> amount); +} + +const unsigned int max_block = 16; + +static void choose_block(bool guest, int i, int *size, int *offset) +{ + unsigned int rand; + + rand = i; + if (guest) { + rand = rand * 19 + 11; + *size = 1 << ((rand % 3) + 2); + rand = rand * 19 + 11; + *offset = (rand % max_block) & ~(*size - 1); + } else { + rand = rand * 17 + 5; + *size = 1 << (rand % 5); + rand = rand * 17 + 5; + *offset = (rand % max_block) & ~(*size - 1); + } +} + +static __uint128_t permutate_bits(bool guest, int i, int size, __uint128_t old) +{ + unsigned int rand; + int amount; + bool swap; + + rand = i; + rand = rand * 3 + 1; + if (guest) + rand = rand * 3 + 1; + swap = rand % 2 == 0; + if (swap) { + int i, j; + __uint128_t new; + uint8_t byte0, byte1; + + rand = rand * 3 + 1; + i = rand % size; + rand = rand * 3 + 1; + j = rand % size; + if (i == j) + return old; + new = rotate(16, old, i * 8); + byte0 = new & 0xff; + new &= ~0xff; + new = rotate(16, new, -i * 8); + new = rotate(16, new, j * 8); + byte1 = new & 0xff; + new = (new & ~0xff) | byte0; + new = rotate(16, new, -j * 8); + new = rotate(16, new, i * 8); + new = new | byte1; + new = rotate(16, new, -i * 8); + return new; + } + rand = rand * 3 + 1; + amount = rand % (size * 8); + return rotate(size, old, amount); +} + +static bool _cmpxchg(int size, void *target, __uint128_t *old_addr, __uint128_t new) +{ + bool ret; + + switch (size) { + case 4: { + uint32_t old = *old_addr; + + asm volatile ("cs %[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(uint32_t *)(target)) + : [new] "d" ((uint32_t)new) + : "cc" + ); + ret = old == (uint32_t)*old_addr; + *old_addr = old; + return ret; + } + case 8: { + uint64_t old = *old_addr; + + asm volatile ("csg %[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(uint64_t *)(target)) + : [new] "d" ((uint64_t)new) + : "cc" + ); + ret = old == (uint64_t)*old_addr; + *old_addr = old; + return ret; + } + case 16: { + __uint128_t old = *old_addr; + + asm volatile ("cdsg 
%[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(__uint128_t *)(target)) + : [new] "d" (new) + : "cc" + ); + ret = old == *old_addr; + *old_addr = old; + return ret; + } + } + GUEST_FAIL("Invalid size = %u", size); + return 0; +} + +const unsigned int cmpxchg_iter_outer = 100, cmpxchg_iter_inner = 10000; + +static void guest_cmpxchg_key(void) +{ + int size, offset; + __uint128_t old, new; + + set_storage_key_range(mem1, max_block, 0x10); + set_storage_key_range(mem2, max_block, 0x10); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (int i = 0; i < cmpxchg_iter_outer; i++) { + do { + old = 1; + } while (!_cmpxchg(16, mem1, &old, 0)); + for (int j = 0; j < cmpxchg_iter_inner; j++) { + choose_block(true, i + j, &size, &offset); + do { + new = permutate_bits(true, i + j, size, old); + } while (!_cmpxchg(size, mem2 + offset, &old, new)); + } + } + + GUEST_SYNC(STAGE_DONE); +} + +static void *run_guest(void *data) +{ + struct test_info *info = data; + + HOST_SYNC(*info, STAGE_DONE); + return NULL; +} + +static char *quad_to_char(__uint128_t *quad, int size) +{ + return ((char *)quad) + (sizeof(*quad) - size); +} + +static void test_cmpxchg_key_concurrent(void) +{ + struct test_default t = test_default_init(guest_cmpxchg_key); + int size, offset; + __uint128_t old, new; + bool success; + pthread_t thread; + + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, max_block, GADDR_V(mem2)); + pthread_create(&thread, NULL, run_guest, &t.vcpu); + + for (int i = 0; i < cmpxchg_iter_outer; i++) { + do { + old = 0; + new = 1; + MOP(t.vm, ABSOLUTE, CMPXCHG, &new, + sizeof(new), GADDR_V(mem1), + CMPXCHG_OLD(&old), + CMPXCHG_SUCCESS(&success), KEY(1)); + } while (!success); + for (int j = 0; j < cmpxchg_iter_inner; j++) { + choose_block(false, i + j, &size, &offset); + do { + new = permutate_bits(false, i + j, size, old); + MOP(t.vm, ABSOLUTE, CMPXCHG, quad_to_char(&new, size), + size, GADDR_V(mem2 + offset), + CMPXCHG_OLD(quad_to_char(&old, size)), + CMPXCHG_SUCCESS(&success), KEY(1)); + } while (!success); + } + } + + pthread_join(thread, NULL); + + MOP(t.vcpu, LOGICAL, READ, mem2, max_block, GADDR_V(mem2)); + TEST_ASSERT(popcount_eq(*(__uint128_t *)mem1, *(__uint128_t *)mem2), + "Must retain number of set bits"); + + kvm_vm_free(t.kvm_vm); +} + +static void guest_copy_key_fetch_prot(void) +{ + /* + * For some reason combining the first sync with override enablement + * results in an exception when calling HOST_SYNC. + */ + GUEST_SYNC(STAGE_INITED); + /* Storage protection override applies to both store and fetch. 
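+	 * Key 0x98 is access-control value 9 with the fetch-protection bit set.
+	 * Storage-protection override (CR0 bit 39) permits accesses with a
+	 * mismatching access key as long as the storage key's access-control
+	 * value is 9, which is what test_copy_key_storage_prot_override() below
+	 * relies on.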
*/ + set_storage_key_range(mem1, sizeof(mem1), 0x98); + set_storage_key_range(mem2, sizeof(mem2), 0x98); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (;;) { + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); + } +} + +static void test_copy_key_storage_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys, storage protection override in effect */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); + + kvm_vm_free(t.kvm_vm); +} + +static void test_copy_key_fetch_prot(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm/vcpu, matching key, fetch protection in effect */ + default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); + default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); + + kvm_vm_free(t.kvm_vm); +} + +#define ERR_PROT_MOP(...) \ +({ \ + int rv; \ + \ + rv = ERR_MOP(__VA_ARGS__); \ + TEST_ASSERT(rv == 4, "Should result in protection exception"); \ +}) + +static void guest_error_key(void) +{ + GUEST_SYNC(STAGE_INITED); + set_storage_key_range(mem1, PAGE_SIZE, 0x18); + set_storage_key_range(mem1 + PAGE_SIZE, sizeof(mem1) - PAGE_SIZE, 0x98); + GUEST_SYNC(STAGE_SKEYS_SET); + GUEST_SYNC(STAGE_IDLED); +} + +static void test_errors_key(void) +{ + struct test_default t = test_default_init(guest_error_key); + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm/vcpu, mismatching keys, fetch protection in effect */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); + + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_cmpxchg_key(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + int i; + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + for (i = 1; i <= 16; i *= 2) { + __uint128_t old = 0; + + ERR_PROT_MOP(t.vm, ABSOLUTE, CMPXCHG, mem2, i, GADDR_V(mem2), + CMPXCHG_OLD(&old), KEY(2)); + } + + kvm_vm_free(t.kvm_vm); +} + +static void test_termination(void) +{ + struct test_default t = test_default_init(guest_error_key); + uint64_t prefix; + uint64_t teid; + uint64_t teid_mask = BIT(63 - 56) | BIT(63 - 60) | BIT(63 - 61); + uint64_t psw[2]; + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys after first page */ + ERR_PROT_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(1), INJECT); + /* + * The memop injected a program exception and the test needs to check the + * Translation-Exception Identification (TEID). It is necessary to run + * the guest in order to be able to read the TEID from guest memory. + * Set the guest program new PSW, so the guest state is not clobbered. 
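+	 * 464 is the absolute lowcore offset of the program new PSW and 168 that
+	 * of the translation-exception identification, so storing the interrupted
+	 * PSW at prefix + 464 lets the guest continue where it was when the
+	 * injected program interruption is delivered, and the host can then read
+	 * the TEID from prefix + 168.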
+ */ + prefix = t.run->s.regs.prefix; + psw[0] = t.run->psw_mask; + psw[1] = t.run->psw_addr; + MOP(t.vm, ABSOLUTE, WRITE, psw, sizeof(psw), GADDR(prefix + 464)); + HOST_SYNC(t.vcpu, STAGE_IDLED); + MOP(t.vm, ABSOLUTE, READ, &teid, sizeof(teid), GADDR(prefix + 168)); + /* Bits 56, 60, 61 form a code, 0 being the only one allowing for termination */ + TEST_ASSERT_EQ(teid & teid_mask, 0); + + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_storage_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm, mismatching keys, storage protection override not applicable to vm */ + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem2), KEY(2)); + + kvm_vm_free(t.kvm_vm); +} + +const uint64_t last_page_addr = -PAGE_SIZE; + +static void guest_copy_key_fetch_prot_override(void) +{ + int i; + char *page_0 = 0; + + GUEST_SYNC(STAGE_INITED); + set_storage_key_range(0, PAGE_SIZE, 0x18); + set_storage_key_range((void *)last_page_addr, PAGE_SIZE, 0x0); + asm volatile ("sske %[key],%[addr]\n" :: [addr] "r"(0L), [key] "r"(0x18) : "cc"); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (;;) { + for (i = 0; i < PAGE_SIZE; i++) + page_0[i] = mem1[i]; + GUEST_SYNC(STAGE_COPIED); + } +} + +static void test_copy_key_fetch_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + goto out; + } + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys on fetch, fetch protection override applies */ + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, PAGE_SIZE, GADDR_V(mem1)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); + ASSERT_MEM_EQ(mem1, mem2, 2048); + + /* + * vcpu, mismatching keys on fetch, fetch protection override applies, + * wraparound + */ + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, 2 * PAGE_SIZE, GADDR_V(guest_last_page)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048, + GADDR_V(guest_last_page), KEY(2)); + ASSERT_MEM_EQ(mem1, mem2, 2048); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_fetch_prot_override_not_enabled(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + goto out; + } + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys on fetch, fetch protection override not enabled */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 
2048, GADDR_V(0), KEY(2)); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_fetch_prot_override_enabled(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + goto out; + } + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* + * vcpu, mismatching keys on fetch, + * fetch protection override does not apply because memory range exceeded + */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 2048 + 1, GADDR_V(0), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048 + 1, + GADDR_V(guest_last_page), KEY(2)); + /* vm, fetch protected override does not apply */ + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR(0), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void guest_idle(void) +{ + GUEST_SYNC(STAGE_INITED); /* for consistency's sake */ + for (;;) + GUEST_SYNC(STAGE_IDLED); +} + +static void _test_errors_common(struct test_info info, enum mop_target target, int size) +{ + int rv; + + /* Bad size: */ + rv = ERR_MOP(info, target, WRITE, mem1, -1, GADDR_V(mem1)); + TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); + + /* Zero size: */ + rv = ERR_MOP(info, target, WRITE, mem1, 0, GADDR_V(mem1)); + TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), + "ioctl allows 0 as size"); + + /* Bad flags: */ + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), SET_FLAGS(-1)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); + + /* Bad guest address: */ + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY); + TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address with CHECK_ONLY"); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL)); + TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address on write"); + + /* Bad host address: */ + rv = ERR_MOP(info, target, WRITE, 0, size, GADDR_V(mem1)); + TEST_ASSERT(rv == -1 && errno == EFAULT, + "ioctl does not report bad host memory address"); + + /* Bad key: */ + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), KEY(17)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows invalid key"); +} + +static void test_errors(void) +{ + struct test_default t = test_default_init(guest_idle); + int rv; + + HOST_SYNC(t.vcpu, STAGE_INITED); + + _test_errors_common(t.vcpu, LOGICAL, t.size); + _test_errors_common(t.vm, ABSOLUTE, t.size); + + /* Bad operation: */ + rv = ERR_MOP(t.vcpu, INVALID, WRITE, mem1, t.size, GADDR_V(mem1)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); + /* virtual addresses are not translated when passing INVALID */ + rv = ERR_MOP(t.vm, INVALID, WRITE, mem1, PAGE_SIZE, GADDR(0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); + + /* Bad access register: */ + t.run->psw_mask &= ~(3UL << (63 - 17)); + t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ + HOST_SYNC(t.vcpu, STAGE_IDLED); /* To sync new state to SIE block 
*/ + rv = ERR_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), AR(17)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15"); + t.run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ + HOST_SYNC(t.vcpu, STAGE_IDLED); /* Run to sync new state */ + + /* Check that the SIDA calls are rejected for non-protected guests */ + rv = ERR_MOP(t.vcpu, SIDA, READ, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_READ in non-protected mode"); + rv = ERR_MOP(t.vcpu, SIDA, WRITE, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_WRITE in non-protected mode"); + + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_cmpxchg(void) +{ + struct test_default t = test_default_init(guest_idle); + __uint128_t old; + int rv, i, power = 1; + + HOST_SYNC(t.vcpu, STAGE_INITED); + + for (i = 0; i < 32; i++) { + if (i == power) { + power *= 2; + continue; + } + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl allows bad size for cmpxchg"); + } + for (i = 1; i <= 16; i *= 2) { + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR((void *)~0xfffUL), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv > 0, "ioctl allows bad guest address for cmpxchg"); + } + for (i = 2; i <= 16; i *= 2) { + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1 + 1), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl allows bad alignment for cmpxchg"); + } + + kvm_vm_free(t.kvm_vm); +} + +int main(int argc, char *argv[]) +{ + int extension_cap, idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP)); + extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); + + struct testdef { + const char *name; + void (*test)(void); + bool requirements_met; + } testlist[] = { + { + .name = "simple copy", + .test = test_copy, + .requirements_met = true, + }, + { + .name = "generic error checks", + .test = test_errors, + .requirements_met = true, + }, + { + .name = "copy with storage keys", + .test = test_copy_key, + .requirements_met = extension_cap > 0, + }, + { + .name = "cmpxchg with storage keys", + .test = test_cmpxchg_key, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "concurrently cmpxchg with storage keys", + .test = test_cmpxchg_key_concurrent, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "copy with key storage protection override", + .test = test_copy_key_storage_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with key fetch protection", + .test = test_copy_key_fetch_prot, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with key fetch protection override", + .test = test_copy_key_fetch_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with access register mode", + .test = test_copy_access_register, + .requirements_met = true, + }, + { + .name = "error checks with key", + .test = test_errors_key, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks for cmpxchg with key", + .test = test_errors_cmpxchg_key, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "error checks for cmpxchg", + .test = test_errors_cmpxchg, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "termination", + .test = test_termination, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks with key storage protection override", + .test = 
test_errors_key_storage_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks without key fetch prot override", + .test = test_errors_key_fetch_prot_override_not_enabled, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks with key fetch prot override", + .test = test_errors_key_fetch_prot_override_enabled, + .requirements_met = extension_cap > 0, + }, + }; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (testlist[idx].requirements_met) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - requirements not met (kernel has extension cap %#x)\n", + testlist[idx].name, extension_cap); + } + } + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390/resets.c b/tools/testing/selftests/kvm/s390/resets.c new file mode 100644 index 000000000000..b58f75b381e5 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/resets.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test for s390x CPU resets + * + * Copyright (C) 2020, IBM + */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" + +#define LOCAL_IRQS 32 + +#define ARBITRARY_NON_ZERO_VCPU_ID 3 + +struct kvm_s390_irq buf[ARBITRARY_NON_ZERO_VCPU_ID + LOCAL_IRQS]; + +static uint8_t regs_null[512]; + +static void guest_code_initial(void) +{ + /* set several CRs to "safe" value */ + unsigned long cr2_59 = 0x10; /* enable guarded storage */ + unsigned long cr8_63 = 0x1; /* monitor mask = 1 */ + unsigned long cr10 = 1; /* PER START */ + unsigned long cr11 = -1; /* PER END */ + + + /* Dirty registers */ + asm volatile ( + " lghi 2,0x11\n" /* Round toward 0 */ + " sfpc 2\n" /* set fpc to !=0 */ + " lctlg 2,2,%0\n" + " lctlg 8,8,%1\n" + " lctlg 10,10,%2\n" + " lctlg 11,11,%3\n" + /* now clobber some general purpose regs */ + " llihh 0,0xffff\n" + " llihl 1,0x5555\n" + " llilh 2,0xaaaa\n" + " llill 3,0x0000\n" + /* now clobber a floating point reg */ + " lghi 4,0x1\n" + " cdgbr 0,4\n" + /* now clobber an access reg */ + " sar 9,4\n" + /* We embed diag 501 here to control register content */ + " diag 0,0,0x501\n" + : + : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11) + /* no clobber list as this should not return */ + ); +} + +static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) +{ + uint64_t eval_reg; + + eval_reg = vcpu_get_reg(vcpu, id); + TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); +} + +static void assert_noirq(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_irq_state irq_state; + int irqs; + + irq_state.len = sizeof(buf); + irq_state.buf = (unsigned long)buf; + irqs = __vcpu_ioctl(vcpu, KVM_S390_GET_IRQ_STATE, &irq_state); + /* + * irqs contains the number of retrieved interrupts. Any interrupt + * (notably, the emergency call interrupt we have injected) should + * be cleared by the resets, so this should be 0. 
+ */ + TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d", errno); + TEST_ASSERT(!irqs, "IRQ pending"); +} + +static void assert_clear(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + struct kvm_sregs sregs; + struct kvm_regs regs; + struct kvm_fpu fpu; + + vcpu_regs_get(vcpu, ®s); + TEST_ASSERT(!memcmp(®s.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); + + vcpu_sregs_get(vcpu, &sregs); + TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); + + vcpu_fpu_get(vcpu, &fpu); + TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); + + /* sync regs */ + TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)), + "gprs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)), + "acrs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)), + "vrs0-15 == 0 (sync_regs)"); +} + +static void assert_initial_noclear(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + + TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, + "gpr0 == 0xffff000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, + "gpr1 == 0x0000555500000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL, + "gpr2 == 0x00000000aaaa0000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL, + "gpr3 == 0x0000000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL, + "fpr0 == 0f1 (sync_regs)"); + TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); +} + +static void assert_initial(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + /* KVM_GET_SREGS */ + vcpu_sregs_get(vcpu, &sregs); + TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (KVM_GET_SREGS)"); + TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12), + "cr1-13 == 0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)"); + + /* sync regs */ + TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (sync_regs)"); + TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12), + "cr1-13 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); + + /* kvm_run */ + TEST_ASSERT(vcpu->run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); + TEST_ASSERT(vcpu->run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); + + vcpu_fpu_get(vcpu, &fpu); + TEST_ASSERT(!fpu.fpc, "fpc == 0"); + + test_one_reg(vcpu, KVM_REG_S390_GBEA, 1); + test_one_reg(vcpu, KVM_REG_S390_PP, 0); + test_one_reg(vcpu, KVM_REG_S390_TODPR, 0); + test_one_reg(vcpu, KVM_REG_S390_CPU_TIMER, 0); + test_one_reg(vcpu, KVM_REG_S390_CLOCK_COMP, 0); +} + +static void assert_normal_noclear(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + + TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); + 
TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); +} + +static void assert_normal(struct kvm_vcpu *vcpu) +{ + test_one_reg(vcpu, KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + TEST_ASSERT(vcpu->run->s.regs.pft == KVM_S390_PFAULT_TOKEN_INVALID, + "pft == 0xff..... (sync_regs)"); + assert_noirq(vcpu); +} + +static void inject_irq(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_irq_state irq_state; + struct kvm_s390_irq *irq = &buf[0]; + int irqs; + + /* Inject IRQ */ + irq_state.len = sizeof(struct kvm_s390_irq); + irq_state.buf = (unsigned long)buf; + irq->type = KVM_S390_INT_EMERGENCY; + irq->u.emerg.code = vcpu->id; + irqs = __vcpu_ioctl(vcpu, KVM_S390_SET_IRQ_STATE, &irq_state); + TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d", errno); +} + +static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) +{ + struct kvm_vm *vm; + + vm = vm_create(1); + + *vcpu = vm_vcpu_add(vm, ARBITRARY_NON_ZERO_VCPU_ID, guest_code_initial); + + return vm; +} + +static void test_normal(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + ksft_print_msg("Testing normal reset\n"); + vm = create_vm(&vcpu); + + vcpu_run(vcpu); + + inject_irq(vcpu); + + vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, NULL); + + /* must clears */ + assert_normal(vcpu); + /* must not clears */ + assert_normal_noclear(vcpu); + assert_initial_noclear(vcpu); + + kvm_vm_free(vm); +} + +static void test_initial(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + ksft_print_msg("Testing initial reset\n"); + vm = create_vm(&vcpu); + + vcpu_run(vcpu); + + inject_irq(vcpu); + + vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, NULL); + + /* must clears */ + assert_normal(vcpu); + assert_initial(vcpu); + /* must not clears */ + assert_initial_noclear(vcpu); + + kvm_vm_free(vm); +} + +static void test_clear(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + ksft_print_msg("Testing clear reset\n"); + vm = create_vm(&vcpu); + + vcpu_run(vcpu); + + inject_irq(vcpu); + + vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, NULL); + + /* must clears */ + assert_normal(vcpu); + assert_initial(vcpu); + assert_clear(vcpu); + + kvm_vm_free(vm); +} + +struct testdef { + const char *name; + void (*test)(void); + bool needs_cap; +} testlist[] = { + { "initial", test_initial, false }, + { "normal", test_normal, true }, + { "clear", test_clear, true }, +}; + +int main(int argc, char *argv[]) +{ + bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); + int idx; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (!testlist[idx].needs_cap || has_s390_vcpu_resets) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - no VCPU_RESETS capability\n", + testlist[idx].name); + } + } + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c new file mode 100644 index 000000000000..bba0d9a6dcc8 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test shared zeropage handling (with/without storage keys) + * + * Copyright (C) 2024, Red Hat, Inc. 
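+ *
+ * s390 storage keys are incompatible with the shared zeropage, so KVM is
+ * expected to unshare existing zeropages and stop handing out new ones as
+ * soon as the guest issues a storage key instruction. The test observes
+ * this through /proc/self/pagemap (PAGEMAP_SCAN with PAGE_IS_PFNZERO).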
+ */ +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +static void set_storage_key(void *addr, uint8_t skey) +{ + asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); +} + +static void guest_code(void) +{ + /* Issue some storage key instruction. */ + set_storage_key((void *)0, 0x98); + GUEST_DONE(); +} + +/* + * Returns 1 if the shared zeropage is mapped, 0 if something else is mapped. + * Returns < 0 on error or if nothing is mapped. + */ +static int maps_shared_zeropage(int pagemap_fd, void *addr) +{ + struct page_region region; + struct pm_scan_arg arg = { + .start = (uintptr_t)addr, + .end = (uintptr_t)addr + 4096, + .vec = (uintptr_t)®ion, + .vec_len = 1, + .size = sizeof(struct pm_scan_arg), + .category_mask = PAGE_IS_PFNZERO, + .category_anyof_mask = PAGE_IS_PRESENT, + .return_mask = PAGE_IS_PFNZERO, + }; + return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); +} + +int main(int argc, char *argv[]) +{ + char *mem, *page0, *page1, *page2, tmp; + const size_t pagesize = getpagesize(); + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int pagemap_fd; + + ksft_print_header(); + ksft_set_plan(3); + + /* + * We'll use memory that is not mapped into the VM for simplicity. + * Shared zeropages are enabled/disabled per-process. + */ + mem = mmap(0, 3 * pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + + /* Disable THP. Ignore errors on older kernels. */ + madvise(mem, 3 * pagesize, MADV_NOHUGEPAGE); + + page0 = mem; + page1 = page0 + pagesize; + page2 = page1 + pagesize; + + /* Can we even detect shared zeropages? */ + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_REQUIRE(pagemap_fd >= 0); + + tmp = *page0; + asm volatile("" : "+r" (tmp)); + TEST_REQUIRE(maps_shared_zeropage(pagemap_fd, page0) == 1); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Verify that we get the shared zeropage after VM creation. */ + tmp = *page1; + asm volatile("" : "+r" (tmp)); + ksft_test_result(maps_shared_zeropage(pagemap_fd, page1) == 1, + "Shared zeropages should be enabled\n"); + + /* + * Let our VM execute a storage key instruction that should + * unshare all shared zeropages. + */ + vcpu_run(vcpu); + get_ucall(vcpu, &uc); + TEST_ASSERT_EQ(uc.cmd, UCALL_DONE); + + /* Verify that we don't have a shared zeropage anymore. */ + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page1), + "Shared zeropage should be gone\n"); + + /* Verify that we don't get any new shared zeropages. */ + tmp = *page2; + asm volatile("" : "+r" (tmp)); + ksft_test_result(!maps_shared_zeropage(pagemap_fd, page2), + "Shared zeropages should be disabled\n"); + + kvm_vm_free(vm); + + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/s390/sync_regs_test.c b/tools/testing/selftests/kvm/s390/sync_regs_test.c new file mode 100644 index 000000000000..53def355ccba --- /dev/null +++ b/tools/testing/selftests/kvm/s390/sync_regs_test.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x KVM_CAP_SYNC_REGS + * + * Based on the same test for x86: + * Copyright (C) 2018, Google LLC. + * + * Adaptions for s390x: + * Copyright (C) 2019, Red Hat, Inc. + * + * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. 
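+ *
+ * With KVM_CAP_SYNC_REGS, the register sets requested in
+ * kvm_run->kvm_valid_regs are mirrored into kvm_run->s.regs on each exit,
+ * and the sets flagged in kvm_run->kvm_dirty_regs are loaded back from
+ * there on the next KVM_RUN, avoiding separate KVM_GET/SET_(S)REGS ioctls.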
+ */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "diag318_test_handler.h" +#include "kselftest.h" + +static void guest_code(void) +{ + /* + * We embed diag 501 here instead of doing a ucall to avoid that + * the compiler has messed with r11 at the time of the ucall. + */ + asm volatile ( + "0: diag 0,0,0x501\n" + " ahi 11,1\n" + " j 0b\n" + ); +} + +#define REG_COMPARE(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%llx, 0x%llx", \ + left->reg, right->reg) + +#define REG_COMPARE32(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%x, 0x%x", \ + left->reg, right->reg) + + +static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE(gprs[i]); +} + +static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE32(acrs[i]); + + for (i = 0; i < 16; i++) + REG_COMPARE(crs[i]); +} + +#undef REG_COMPARE + +#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) +#define INVALID_SYNC_FIELD 0x80000000 + +void test_read_invalid(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int rv; + + /* Request reading invalid register set from VCPU. */ + run->kvm_valid_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_valid_regs = 0; + + run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_valid_regs = 0; +} + +void test_set_invalid(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int rv; + + /* Request setting invalid register set into VCPU. */ + run->kvm_dirty_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_dirty_regs = 0; + + run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_dirty_regs = 0; +} + +void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; + + /* Request and verify all valid register sets. 
*/ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT(run->s390_sieic.icptcode == 4 && + (run->s390_sieic.ipa >> 8) == 0x83 && + (run->s390_sieic.ipb >> 16) == 0x501, + "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x", + run->s390_sieic.icptcode, run->s390_sieic.ipa, + run->s390_sieic.ipb); + + vcpu_regs_get(vcpu, ®s); + compare_regs(®s, &run->s.regs); + + vcpu_sregs_get(vcpu, &sregs); + compare_sregs(&sregs, &run->s.regs); +} + +void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; + + /* Set and verify various register values */ + run->s.regs.gprs[11] = 0xBAD1DEA; + run->s.regs.acrs[0] = 1 << 11; + + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS; + + if (get_diag318_info() > 0) { + run->s.regs.diag318 = get_diag318_info(); + run->kvm_dirty_regs |= KVM_SYNC_DIAG318; + } + + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, + "acr0 sync regs value incorrect 0x%x.", + run->s.regs.acrs[0]); + TEST_ASSERT(run->s.regs.diag318 == get_diag318_info(), + "diag318 sync regs value incorrect 0x%llx.", + run->s.regs.diag318); + + vcpu_regs_get(vcpu, ®s); + compare_regs(®s, &run->s.regs); + + vcpu_sregs_get(vcpu, &sregs); + compare_sregs(&sregs, &run->s.regs); +} + +void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int rv; + + /* Clear kvm_dirty_regs bits, verify new s.regs values are + * overwritten with existing guest values. 
+ */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = 0; + run->s.regs.gprs[11] = 0xDEADBEEF; + run->s.regs.diag318 = 0x4B1D; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + TEST_ASSERT(run->s.regs.diag318 != 0x4B1D, + "diag318 sync regs value incorrect 0x%llx.", + run->s.regs.diag318); +} + +struct testdef { + const char *name; + void (*test)(struct kvm_vcpu *vcpu); +} testlist[] = { + { "read invalid", test_read_invalid }, + { "set invalid", test_set_invalid }, + { "request+verify all valid regs", test_req_and_verify_all_valid_regs }, + { "set+verify various regs", test_set_and_verify_various_reg_values }, + { "clear kvm_dirty_regs bits", test_clear_kvm_dirty_regs_bits }, +}; + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); + + ksft_print_header(); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(vcpu); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390/tprot.c b/tools/testing/selftests/kvm/s390/tprot.c new file mode 100644 index 000000000000..12d5e1cb62e3 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/tprot.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test TEST PROTECTION emulation. + * + * Copyright IBM Corp. 2021 + */ +#include +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" +#include "processor.h" + +#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) +#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) + +static __aligned(PAGE_SIZE) uint8_t pages[2][PAGE_SIZE]; +static uint8_t *const page_store_prot = pages[0]; +static uint8_t *const page_fetch_prot = pages[1]; + +/* Nonzero return value indicates that address not mapped */ +static int set_storage_key(void *addr, uint8_t key) +{ + int not_mapped = 0; + + asm volatile ( + "lra %[addr], 0(0,%[addr])\n" + " jz 0f\n" + " llill %[not_mapped],1\n" + " j 1f\n" + "0: sske %[key], %[addr]\n" + "1:" + : [addr] "+&a" (addr), [not_mapped] "+r" (not_mapped) + : [key] "r" (key) + : "cc" + ); + return -not_mapped; +} + +enum permission { + READ_WRITE = 0, + READ = 1, + RW_PROTECTED = 2, + TRANSL_UNAVAIL = 3, +}; + +static enum permission test_protection(void *addr, uint8_t key) +{ + uint64_t mask; + + asm volatile ( + "tprot %[addr], 0(%[key])\n" + " ipm %[mask]\n" + : [mask] "=r" (mask) + : [addr] "Q" (*(char *)addr), + [key] "a" (key) + : "cc" + ); + + return (enum permission)(mask >> 28); +} + +enum stage { + STAGE_INIT_SIMPLE, + TEST_SIMPLE, + STAGE_INIT_FETCH_PROT_OVERRIDE, + TEST_FETCH_PROT_OVERRIDE, + TEST_STORAGE_PROT_OVERRIDE, + STAGE_END /* must be the last entry (it's the amount of tests) */ +}; + +struct test { + enum stage stage; + void *addr; + uint8_t key; + enum permission expected; +} tests[] = { + /* + * We perform each test in the array by executing TEST PROTECTION on + * the specified addr with the specified key and checking if the returned + * permissions match the expected value. 
+ * Both guest and host cooperate to set up the required test conditions. + * A central condition is that the page targeted by addr has to be DAT + * protected in the host mappings, in order for KVM to emulate the + * TEST PROTECTION instruction. + * Since the page tables are shared, the host uses mprotect to achieve + * this. + * + * Test resulting in RW_PROTECTED/TRANSL_UNAVAIL will be interpreted + * by SIE, not KVM, but there is no harm in testing them also. + * See Enhanced Suppression-on-Protection Facilities in the + * Interpretive-Execution Mode + */ + /* + * guest: set storage key of page_store_prot to 1 + * storage key of page_fetch_prot to 9 and enable + * protection for it + * STAGE_INIT_SIMPLE + * host: write protect both via mprotect + */ + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x10, READ_WRITE }, + /* mismatched keys, but no fetch protection -> RO */ + { TEST_SIMPLE, page_store_prot, 0x20, READ }, + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x90, READ_WRITE }, + /* mismatched keys, fetch protection -> inaccessible */ + { TEST_SIMPLE, page_fetch_prot, 0x10, RW_PROTECTED }, + /* page 0 not mapped yet -> translation not available */ + { TEST_SIMPLE, (void *)0x00, 0x10, TRANSL_UNAVAIL }, + /* + * host: try to map page 0 + * guest: set storage key of page 0 to 9 and enable fetch protection + * STAGE_INIT_FETCH_PROT_OVERRIDE + * host: write protect page 0 + * enable fetch protection override + */ + /* mismatched keys, fetch protection, but override applies -> RO */ + { TEST_FETCH_PROT_OVERRIDE, (void *)0x00, 0x10, READ }, + /* mismatched keys, fetch protection, override applies to 0-2048 only -> inaccessible */ + { TEST_FETCH_PROT_OVERRIDE, (void *)2049, 0x10, RW_PROTECTED }, + /* + * host: enable storage protection override + */ + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, page_fetch_prot, 0x10, READ_WRITE }, + /* mismatched keys, no fetch protection, override doesn't apply -> RO */ + { TEST_STORAGE_PROT_OVERRIDE, page_store_prot, 0x20, READ }, + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, (void *)2049, 0x10, READ_WRITE }, + /* end marker */ + { STAGE_END, 0, 0, 0 }, +}; + +static enum stage perform_next_stage(int *i, bool mapped_0) +{ + enum stage stage = tests[*i].stage; + enum permission result; + bool skip; + + for (; tests[*i].stage == stage; (*i)++) { + /* + * Some fetch protection override tests require that page 0 + * be mapped, however, when the hosts tries to map that page via + * vm_vaddr_alloc, it may happen that some other page gets mapped + * instead. 
+ * In order to skip these tests we detect this inside the guest + */ + skip = tests[*i].addr < (void *)PAGE_SIZE && + tests[*i].expected != TRANSL_UNAVAIL && + !mapped_0; + if (!skip) { + result = test_protection(tests[*i].addr, tests[*i].key); + __GUEST_ASSERT(result == tests[*i].expected, + "Wanted %u, got %u, for i = %u", + tests[*i].expected, result, *i); + } + } + return stage; +} + +static void guest_code(void) +{ + bool mapped_0; + int i = 0; + + GUEST_ASSERT_EQ(set_storage_key(page_store_prot, 0x10), 0); + GUEST_ASSERT_EQ(set_storage_key(page_fetch_prot, 0x98), 0); + GUEST_SYNC(STAGE_INIT_SIMPLE); + GUEST_SYNC(perform_next_stage(&i, false)); + + /* Fetch-protection override */ + mapped_0 = !set_storage_key((void *)0, 0x98); + GUEST_SYNC(STAGE_INIT_FETCH_PROT_OVERRIDE); + GUEST_SYNC(perform_next_stage(&i, mapped_0)); + + /* Storage-protection override */ + GUEST_SYNC(perform_next_stage(&i, mapped_0)); +} + +#define HOST_SYNC_NO_TAP(vcpup, stage) \ +({ \ + struct kvm_vcpu *__vcpu = (vcpup); \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ + if (uc.cmd == UCALL_ABORT) \ + REPORT_GUEST_ASSERT(uc); \ + TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + TEST_ASSERT_EQ(uc.args[1], __stage); \ +}) + +#define HOST_SYNC(vcpu, stage) \ +({ \ + HOST_SYNC_NO_TAP(vcpu, stage); \ + ksft_test_result_pass("" #stage "\n"); \ +}) + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_run *run; + vm_vaddr_t guest_0_page; + + ksft_print_header(); + ksft_set_plan(STAGE_END); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + HOST_SYNC(vcpu, STAGE_INIT_SIMPLE); + mprotect(addr_gva2hva(vm, (vm_vaddr_t)pages), PAGE_SIZE * 2, PROT_READ); + HOST_SYNC(vcpu, TEST_SIMPLE); + + guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); + if (guest_0_page != 0) { + /* Use NO_TAP so we don't get a PASS print */ + HOST_SYNC_NO_TAP(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); + ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " + "Did not allocate page at 0\n"); + } else { + HOST_SYNC(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); + } + if (guest_0_page == 0) + mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); + run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vcpu, TEST_FETCH_PROT_OVERRIDE); + + run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vcpu, TEST_STORAGE_PROT_OVERRIDE); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390/ucontrol_test.c b/tools/testing/selftests/kvm/s390/ucontrol_test.c new file mode 100644 index 000000000000..0c112319dab1 --- /dev/null +++ b/tools/testing/selftests/kvm/s390/ucontrol_test.c @@ -0,0 +1,638 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test code for the s390x kvm ucontrol interface + * + * Copyright IBM Corp. 
2024 + * + * Authors: + * Christoph Schlameuss + */ +#include "debug_print.h" +#include "kselftest_harness.h" +#include "kvm_util.h" +#include "processor.h" +#include "sie.h" + +#include +#include + +#define PGM_SEGMENT_TRANSLATION 0x10 + +#define VM_MEM_SIZE (4 * SZ_1M) +#define VM_MEM_EXT_SIZE (2 * SZ_1M) +#define VM_MEM_MAX_M ((VM_MEM_SIZE + VM_MEM_EXT_SIZE) / SZ_1M) + +/* so directly declare capget to check caps without libcap */ +int capget(cap_user_header_t header, cap_user_data_t data); + +/** + * In order to create user controlled virtual machines on S390, + * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL + * as privileged user (SYS_ADMIN). + */ +void require_ucontrol_admin(void) +{ + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; + struct __user_cap_header_struct hdr = { + .version = _LINUX_CAPABILITY_VERSION_3, + }; + int rc; + + rc = capget(&hdr, data); + TEST_ASSERT_EQ(0, rc); + TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); +} + +/* Test program setting some registers and looping */ +extern char test_gprs_asm[]; +asm("test_gprs_asm:\n" + "xgr %r0, %r0\n" + "lgfi %r1,1\n" + "lgfi %r2,2\n" + "lgfi %r3,3\n" + "lgfi %r4,4\n" + "lgfi %r5,5\n" + "lgfi %r6,6\n" + "lgfi %r7,7\n" + "0:\n" + " diag 0,0,0x44\n" + " ahi %r0,1\n" + " j 0b\n" +); + +/* Test program manipulating memory */ +extern char test_mem_asm[]; +asm("test_mem_asm:\n" + "xgr %r0, %r0\n" + + "0:\n" + " ahi %r0,1\n" + " st %r1,0(%r5,%r6)\n" + + " xgr %r1,%r1\n" + " l %r1,0(%r5,%r6)\n" + " ahi %r0,1\n" + " diag 0,0,0x44\n" + + " j 0b\n" +); + +/* Test program manipulating storage keys */ +extern char test_skey_asm[]; +asm("test_skey_asm:\n" + "xgr %r0, %r0\n" + + "0:\n" + " ahi %r0,1\n" + " st %r1,0(%r5,%r6)\n" + + " iske %r1,%r6\n" + " ahi %r0,1\n" + " diag 0,0,0x44\n" + + " sske %r1,%r6\n" + " xgr %r1,%r1\n" + " iske %r1,%r6\n" + " ahi %r0,1\n" + " diag 0,0,0x44\n" + + " rrbe %r1,%r6\n" + " iske %r1,%r6\n" + " ahi %r0,1\n" + " diag 0,0,0x44\n" + + " j 0b\n" +); + +FIXTURE(uc_kvm) +{ + struct kvm_s390_sie_block *sie_block; + struct kvm_run *run; + uintptr_t base_gpa; + uintptr_t code_gpa; + uintptr_t base_hva; + uintptr_t code_hva; + int kvm_run_size; + vm_paddr_t pgd; + void *vm_mem; + int vcpu_fd; + int kvm_fd; + int vm_fd; +}; + +/** + * create VM with single vcpu, map kvm_run and SIE control block for easy access + */ +FIXTURE_SETUP(uc_kvm) +{ + struct kvm_s390_vm_cpu_processor info; + int rc; + + require_ucontrol_admin(); + + self->kvm_fd = open_kvm_dev_path_or_exit(); + self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(self->vm_fd, 0); + + kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + TH_LOG("create VM 0x%llx", info.cpuid); + + self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(self->vcpu_fd, 0); + + self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) + TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); + self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, + PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); + ASSERT_NE(self->run, MAP_FAILED); + /** + * For virtual cpus that have been created with S390 user controlled + * virtual machines, the resulting vcpu fd can be memory mapped at page + * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of + * the virtual cpu's hardware control block. 
+ */ + self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, + self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); + ASSERT_NE(self->sie_block, MAP_FAILED); + + TH_LOG("VM created %p %p", self->run, self->sie_block); + + self->base_gpa = 0; + self->code_gpa = self->base_gpa + (3 * SZ_1M); + + self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_MAX_M * SZ_1M); + ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); + self->base_hva = (uintptr_t)self->vm_mem; + self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; + struct kvm_s390_ucas_mapping map = { + .user_addr = self->base_hva, + .vcpu_addr = self->base_gpa, + .length = VM_MEM_SIZE, + }; + TH_LOG("ucas map %p %p 0x%llx", + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); + rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); + ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", + rc, strerror(errno)); + + TH_LOG("page in %p", (void *)self->base_gpa); + rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); + ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", + (void *)self->base_hva, rc, strerror(errno)); + + self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; +} + +FIXTURE_TEARDOWN(uc_kvm) +{ + munmap(self->sie_block, PAGE_SIZE); + munmap(self->run, self->kvm_run_size); + close(self->vcpu_fd); + close(self->vm_fd); + close(self->kvm_fd); + free(self->vm_mem); +} + +TEST_F(uc_kvm, uc_sie_assertions) +{ + /* assert interception of Code 08 (Program Interruption) is set */ + EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); +} + +TEST_F(uc_kvm, uc_attr_mem_limit) +{ + u64 limit; + struct kvm_device_attr attr = { + .group = KVM_S390_VM_MEM_CTRL, + .attr = KVM_S390_VM_MEM_LIMIT_SIZE, + .addr = (unsigned long)&limit, + }; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); + EXPECT_EQ(0, rc); + EXPECT_EQ(~0UL, limit); + + /* assert set not supported */ + rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(uc_kvm, uc_no_dirty_log) +{ + struct kvm_dirty_log dlog; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + +/** + * Assert HPAGE CAP cannot be enabled on UCONTROL VM + */ +TEST(uc_cap_hpage) +{ + int rc, kvm_fd, vm_fd, vcpu_fd; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_S390_HPAGE_1M, + }; + + require_ucontrol_admin(); + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(vm_fd, 0); + + /* assert hpages are not supported on ucontrol vm */ + rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); + EXPECT_EQ(0, rc); + + /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); + + /* assert HPAGE CAP is rejected after vCPU creation */ + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(vcpu_fd, 0); + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EBUSY, errno); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +/* calculate host virtual addr from guest physical addr */ +static void *gpa2hva(FIXTURE_DATA(uc_kvm) *self, u64 gpa) +{ + return (void *)(self->base_hva - self->base_gpa + gpa); +} + +/* map / make additional memory available */ +static int uc_map_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length) +{ + struct kvm_s390_ucas_mapping map = { + .user_addr = 
(u64)gpa2hva(self, vcpu_addr), + .vcpu_addr = vcpu_addr, + .length = length, + }; + pr_info("ucas map %p %p 0x%llx", + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); + return ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); +} + +/* unmap previously mapped memory */ +static int uc_unmap_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length) +{ + struct kvm_s390_ucas_mapping map = { + .user_addr = (u64)gpa2hva(self, vcpu_addr), + .vcpu_addr = vcpu_addr, + .length = length, + }; + pr_info("ucas unmap %p %p 0x%llx", + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); + return ioctl(self->vcpu_fd, KVM_S390_UCAS_UNMAP, &map); +} + +/* handle ucontrol exit by mapping the accessed segment */ +static void uc_handle_exit_ucontrol(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_run *run = self->run; + u64 seg_addr; + int rc; + + TEST_ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); + switch (run->s390_ucontrol.pgm_code) { + case PGM_SEGMENT_TRANSLATION: + seg_addr = run->s390_ucontrol.trans_exc_code & ~(SZ_1M - 1); + pr_info("ucontrol pic segment translation 0x%llx, mapping segment 0x%lx\n", + run->s390_ucontrol.trans_exc_code, seg_addr); + /* map / make additional memory available */ + rc = uc_map_ext(self, seg_addr, SZ_1M); + TEST_ASSERT_EQ(0, rc); + break; + default: + TEST_FAIL("UNEXPECTED PGM CODE %d", run->s390_ucontrol.pgm_code); + } +} + +/* + * Handle the SIEIC exit + * * fail on codes not expected in the test cases + * Returns if interception is handled / execution can be continued + */ +static void uc_skey_enable(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + + /* disable KSS */ + sie_block->cpuflags &= ~CPUSTAT_KSS; + /* disable skey inst interception */ + sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); +} + +/* + * Handle the instruction intercept + * Returns if interception is handled / execution can be continued + */ +static bool uc_handle_insn_ic(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + int ilen = insn_length(sie_block->ipa >> 8); + struct kvm_run *run = self->run; + + switch (run->s390_sieic.ipa) { + case 0xB229: /* ISKE */ + case 0xB22b: /* SSKE */ + case 0xB22a: /* RRBE */ + uc_skey_enable(self); + + /* rewind to reexecute intercepted instruction */ + run->psw_addr = run->psw_addr - ilen; + pr_info("rewind guest addr to 0x%.16llx\n", run->psw_addr); + return true; + default: + return false; + } +} + +/* + * Handle the SIEIC exit + * * fail on codes not expected in the test cases + * Returns if interception is handled / execution can be continued + */ +static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + struct kvm_run *run = self->run; + + /* check SIE interception code */ + pr_info("sieic: 0x%.2x 0x%.4x 0x%.8x\n", + run->s390_sieic.icptcode, + run->s390_sieic.ipa, + run->s390_sieic.ipb); + switch (run->s390_sieic.icptcode) { + case ICPT_INST: + /* end execution in caller on intercepted instruction */ + pr_info("sie instruction interception\n"); + return uc_handle_insn_ic(self); + case ICPT_KSS: + uc_skey_enable(self); + return true; + case ICPT_OPEREXC: + /* operation exception */ + TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb); + default: + TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode); + } + return true; +} + +/* verify VM state on exit */ +static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_run *run = self->run; + + switch 
(run->exit_reason) { + case KVM_EXIT_S390_UCONTROL: + /** check program interruption code + * handle page fault --> ucas map + */ + uc_handle_exit_ucontrol(self); + break; + case KVM_EXIT_S390_SIEIC: + return uc_handle_sieic(self); + default: + pr_info("exit_reason %2d not handled\n", run->exit_reason); + } + return true; +} + +/* run the VM until interrupted */ +static int uc_run_once(FIXTURE_DATA(uc_kvm) *self) +{ + int rc; + + rc = ioctl(self->vcpu_fd, KVM_RUN, NULL); + print_run(self->run, self->sie_block); + print_regs(self->run); + pr_debug("run %d / %d %s\n", rc, errno, strerror(errno)); + return rc; +} + +static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) *self) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + + /* assert vm was interrupted by diag 0x0044 */ + TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); + TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); + TEST_ASSERT_EQ(0x8300, sie_block->ipa); + TEST_ASSERT_EQ(0x440000, sie_block->ipb); +} + +TEST_F(uc_kvm, uc_no_user_region) +{ + struct kvm_userspace_memory_region region = { + .slot = 1, + .guest_phys_addr = self->code_gpa, + .memory_size = VM_MEM_EXT_SIZE, + .userspace_addr = (uintptr_t)self->code_hva, + }; + struct kvm_userspace_memory_region2 region2 = { + .slot = 1, + .guest_phys_addr = self->code_gpa, + .memory_size = VM_MEM_EXT_SIZE, + .userspace_addr = (uintptr_t)self->code_hva, + }; + + ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, ®ion)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, ®ion2)); + ASSERT_EQ(EINVAL, errno); +} + +TEST_F(uc_kvm, uc_map_unmap) +{ + struct kvm_sync_regs *sync_regs = &self->run->s.regs; + struct kvm_run *run = self->run; + const u64 disp = 1; + int rc; + + /* copy test_mem_asm to code_hva / code_gpa */ + TH_LOG("copy code %p to vm mapped memory %p / %p", + &test_mem_asm, (void *)self->code_hva, (void *)self->code_gpa); + memcpy((void *)self->code_hva, &test_mem_asm, PAGE_SIZE); + + /* DAT disabled + 64 bit mode */ + run->psw_mask = 0x0000000180000000ULL; + run->psw_addr = self->code_gpa; + + /* set register content for test_mem_asm to access not mapped memory*/ + sync_regs->gprs[1] = 0x55; + sync_regs->gprs[5] = self->base_gpa; + sync_regs->gprs[6] = VM_MEM_SIZE + disp; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + + /* run and expect to fail with ucontrol pic segment translation */ + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(1, sync_regs->gprs[0]); + ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); + + ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code); + ASSERT_EQ(self->base_gpa + VM_MEM_SIZE, run->s390_ucontrol.trans_exc_code); + + /* fail to map memory with not segment aligned address */ + rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE + disp, VM_MEM_EXT_SIZE); + ASSERT_GT(0, rc) + TH_LOG("ucas map for non segment address should fail but didn't; " + "result %d not expected, %s", rc, strerror(errno)); + + /* map / make additional memory available */ + rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE); + ASSERT_EQ(0, rc) + TH_LOG("ucas map result %d not expected, %s", rc, strerror(errno)); + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + uc_assert_diag44(self); + + /* assert registers and memory are in expected state */ + ASSERT_EQ(2, sync_regs->gprs[0]); + ASSERT_EQ(0x55, sync_regs->gprs[1]); + ASSERT_EQ(0x55, *(u32 *)gpa2hva(self, self->base_gpa + VM_MEM_SIZE + disp)); + + /* unmap and run loop again */ + rc = uc_unmap_ext(self, 
self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE); + ASSERT_EQ(0, rc) + TH_LOG("ucas unmap result %d not expected, %s", rc, strerror(errno)); + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(3, sync_regs->gprs[0]); + ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); + ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code); + /* handle ucontrol exit and remap memory after previous map and unmap */ + ASSERT_EQ(true, uc_handle_exit(self)); +} + +TEST_F(uc_kvm, uc_gprs) +{ + struct kvm_sync_regs *sync_regs = &self->run->s.regs; + struct kvm_run *run = self->run; + struct kvm_regs regs = {}; + + /* Set registers to values that are different from the ones that we expect below */ + for (int i = 0; i < 8; i++) + sync_regs->gprs[i] = 8; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + + /* copy test_gprs_asm to code_hva / code_gpa */ + TH_LOG("copy code %p to vm mapped memory %p / %p", + &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa); + memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE); + + /* DAT disabled + 64 bit mode */ + run->psw_mask = 0x0000000180000000ULL; + run->psw_addr = self->code_gpa; + + /* run and expect interception of diag 44 */ + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + uc_assert_diag44(self); + + /* Retrieve and check guest register values */ + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); + for (int i = 0; i < 8; i++) { + ASSERT_EQ(i, regs.gprs[i]); + ASSERT_EQ(i, sync_regs->gprs[i]); + } + + /* run and expect interception of diag 44 again */ + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + uc_assert_diag44(self); + + /* check continued increment of register 0 value */ + ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); + ASSERT_EQ(1, regs.gprs[0]); + ASSERT_EQ(1, sync_regs->gprs[0]); +} + +TEST_F(uc_kvm, uc_skey) +{ + struct kvm_s390_sie_block *sie_block = self->sie_block; + struct kvm_sync_regs *sync_regs = &self->run->s.regs; + u64 test_vaddr = VM_MEM_SIZE - (SZ_1M / 2); + struct kvm_run *run = self->run; + const u8 skeyvalue = 0x34; + + /* copy test_skey_asm to code_hva / code_gpa */ + TH_LOG("copy code %p to vm mapped memory %p / %p", + &test_skey_asm, (void *)self->code_hva, (void *)self->code_gpa); + memcpy((void *)self->code_hva, &test_skey_asm, PAGE_SIZE); + + /* set register content for test_skey_asm to access not mapped memory */ + sync_regs->gprs[1] = skeyvalue; + sync_regs->gprs[5] = self->base_gpa; + sync_regs->gprs[6] = test_vaddr; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + + /* DAT disabled + 64 bit mode */ + run->psw_mask = 0x0000000180000000ULL; + run->psw_addr = self->code_gpa; + + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(true, uc_handle_exit(self)); + ASSERT_EQ(1, sync_regs->gprs[0]); + + /* ISKE */ + ASSERT_EQ(0, uc_run_once(self)); + + /* + * Bail out and skip the test after uc_skey_enable was executed but iske + * is still intercepted. Instructions are not handled by the kernel. + * Thus there is no need to test this here. + */ + TEST_ASSERT_EQ(0, sie_block->cpuflags & CPUSTAT_KSS); + TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)); + TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); + TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); + TEST_REQUIRE(sie_block->ipa != 0xb229); + + /* ISKE contd. 
*/ + ASSERT_EQ(false, uc_handle_exit(self)); + ASSERT_EQ(2, sync_regs->gprs[0]); + /* assert initial skey (ACC = 0, R & C = 1) */ + ASSERT_EQ(0x06, sync_regs->gprs[1]); + uc_assert_diag44(self); + + /* SSKE + ISKE */ + sync_regs->gprs[1] = skeyvalue; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + ASSERT_EQ(3, sync_regs->gprs[0]); + ASSERT_EQ(skeyvalue, sync_regs->gprs[1]); + uc_assert_diag44(self); + + /* RRBE + ISKE */ + sync_regs->gprs[1] = skeyvalue; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; + ASSERT_EQ(0, uc_run_once(self)); + ASSERT_EQ(false, uc_handle_exit(self)); + ASSERT_EQ(4, sync_regs->gprs[0]); + /* assert R reset but rest of skey unchanged */ + ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]); + ASSERT_EQ(0, sync_regs->gprs[1] & 0x04); + uc_assert_diag44(self); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c deleted file mode 100644 index e32dd59703a0..000000000000 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ /dev/null @@ -1,695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test for s390x CMMA migration - * - * Copyright IBM Corp. 2023 - * - * Authors: - * Nico Boehr - */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "kselftest.h" -#include "ucall_common.h" -#include "processor.h" - -#define MAIN_PAGE_COUNT 512 - -#define TEST_DATA_PAGE_COUNT 512 -#define TEST_DATA_MEMSLOT 1 -#define TEST_DATA_START_GFN PAGE_SIZE - -#define TEST_DATA_TWO_PAGE_COUNT 256 -#define TEST_DATA_TWO_MEMSLOT 2 -#define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) - -static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; - -/** - * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, - * so use_cmma goes on and the CMMA related ioctls do something. - */ -static void guest_do_one_essa(void) -{ - asm volatile( - /* load TEST_DATA_START_GFN into r1 */ - " llilf 1,%[start_gfn]\n" - /* calculate the address from the gfn */ - " sllg 1,1,12(0)\n" - /* set the first page in TEST_DATA memslot to STABLE */ - " .insn rrf,0xb9ab0000,2,1,1,0\n" - /* hypercall */ - " diag 0,0,0x501\n" - "0: j 0b" - : - : [start_gfn] "L"(TEST_DATA_START_GFN) - : "r1", "r2", "memory", "cc" - ); -} - -/** - * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable - * state. 
- */ -static void guest_dirty_test_data(void) -{ - asm volatile( - /* r1 = TEST_DATA_START_GFN */ - " xgr 1,1\n" - " llilf 1,%[start_gfn]\n" - /* r5 = TEST_DATA_PAGE_COUNT */ - " lghi 5,%[page_count]\n" - /* r5 += r1 */ - "2: agfr 5,1\n" - /* r2 = r1 << PAGE_SHIFT */ - "1: sllg 2,1,12(0)\n" - /* essa(r4, r2, SET_STABLE) */ - " .insn rrf,0xb9ab0000,4,2,1,0\n" - /* i++ */ - " agfi 1,1\n" - /* if r1 < r5 goto 1 */ - " cgrjl 1,5,1b\n" - /* hypercall */ - " diag 0,0,0x501\n" - "0: j 0b" - : - : [start_gfn] "L"(TEST_DATA_START_GFN), - [page_count] "L"(TEST_DATA_PAGE_COUNT) - : - /* the counter in our loop over the pages */ - "r1", - /* the calculated page physical address */ - "r2", - /* ESSA output register */ - "r4", - /* last page */ - "r5", - "cc", "memory" - ); -} - -static void create_main_memslot(struct kvm_vm *vm) -{ - int i; - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0); - /* set the array of memslots to zero like __vm_create does */ - for (i = 0; i < NR_MEM_REGIONS; i++) - vm->memslots[i] = 0; -} - -static void create_test_memslot(struct kvm_vm *vm) -{ - vm_userspace_mem_region_add(vm, - VM_MEM_SRC_ANONYMOUS, - TEST_DATA_START_GFN << vm->page_shift, - TEST_DATA_MEMSLOT, - TEST_DATA_PAGE_COUNT, - 0 - ); - vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; -} - -static void create_memslots(struct kvm_vm *vm) -{ - /* - * Our VM has the following memory layout: - * +------+---------------------------+ - * | GFN | Memslot | - * +------+---------------------------+ - * | 0 | | - * | ... | MAIN (Code, Stack, ...) | - * | 511 | | - * +------+---------------------------+ - * | 4096 | | - * | ... | TEST_DATA | - * | 4607 | | - * +------+---------------------------+ - */ - create_main_memslot(vm); - create_test_memslot(vm); -} - -static void finish_vm_setup(struct kvm_vm *vm) -{ - struct userspace_mem_region *slot0; - - kvm_vm_elf_load(vm, program_invocation_name); - - slot0 = memslot2region(vm, 0); - ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); - - kvm_arch_vm_post_create(vm); -} - -static struct kvm_vm *create_vm_two_memslots(void) -{ - struct kvm_vm *vm; - - vm = vm_create_barebones(); - - create_memslots(vm); - - finish_vm_setup(vm); - - return vm; -} - -static void enable_cmma(struct kvm_vm *vm) -{ - int r; - - r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL); - TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno); -} - -static void enable_dirty_tracking(struct kvm_vm *vm) -{ - vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES); - vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); -} - -static int __enable_migration_mode(struct kvm_vm *vm) -{ - return __kvm_device_attr_set(vm->fd, - KVM_S390_VM_MIGRATION, - KVM_S390_VM_MIGRATION_START, - NULL - ); -} - -static void enable_migration_mode(struct kvm_vm *vm) -{ - int r = __enable_migration_mode(vm); - - TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno); -} - -static bool is_migration_mode_on(struct kvm_vm *vm) -{ - u64 out; - int r; - - r = __kvm_device_attr_get(vm->fd, - KVM_S390_VM_MIGRATION, - KVM_S390_VM_MIGRATION_STATUS, - &out - ); - TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno); - return out; -} - -static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out) -{ - struct kvm_s390_cmma_log args; - int rc; - - errno = 0; - - args = (struct kvm_s390_cmma_log){ - .start_gfn = 0, - .count = sizeof(cmma_value_buf), - .flags 
= flags, - .values = (__u64)&cmma_value_buf[0] - }; - rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); - - *errno_out = errno; - return rc; -} - -static void test_get_cmma_basic(void) -{ - struct kvm_vm *vm = create_vm_two_memslots(); - struct kvm_vcpu *vcpu; - int rc, errno_out; - - /* GET_CMMA_BITS without CMMA enabled should fail */ - rc = vm_get_cmma_bits(vm, 0, &errno_out); - TEST_ASSERT_EQ(rc, -1); - TEST_ASSERT_EQ(errno_out, ENXIO); - - enable_cmma(vm); - vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); - - vcpu_run(vcpu); - - /* GET_CMMA_BITS without migration mode and without peeking should fail */ - rc = vm_get_cmma_bits(vm, 0, &errno_out); - TEST_ASSERT_EQ(rc, -1); - TEST_ASSERT_EQ(errno_out, EINVAL); - - /* GET_CMMA_BITS without migration mode and with peeking should work */ - rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out); - TEST_ASSERT_EQ(rc, 0); - TEST_ASSERT_EQ(errno_out, 0); - - enable_dirty_tracking(vm); - enable_migration_mode(vm); - - /* GET_CMMA_BITS with invalid flags */ - rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out); - TEST_ASSERT_EQ(rc, -1); - TEST_ASSERT_EQ(errno_out, EINVAL); - - kvm_vm_free(vm); -} - -static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu) -{ - TEST_ASSERT_EQ(vcpu->run->exit_reason, 13); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000); -} - -static void test_migration_mode(void) -{ - struct kvm_vm *vm = vm_create_barebones(); - struct kvm_vcpu *vcpu; - u64 orig_psw; - int rc; - - /* enabling migration mode on a VM without memory should fail */ - rc = __enable_migration_mode(vm); - TEST_ASSERT_EQ(rc, -1); - TEST_ASSERT_EQ(errno, EINVAL); - TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); - errno = 0; - - create_memslots(vm); - finish_vm_setup(vm); - - enable_cmma(vm); - vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); - orig_psw = vcpu->run->psw_addr; - - /* - * Execute one essa instruction in the guest. Otherwise the guest will - * not have use_cmm enabled and GET_CMMA_BITS will return no pages. - */ - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - /* migration mode when memslots have dirty tracking off should fail */ - rc = __enable_migration_mode(vm); - TEST_ASSERT_EQ(rc, -1); - TEST_ASSERT_EQ(errno, EINVAL); - TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); - errno = 0; - - /* enable dirty tracking */ - enable_dirty_tracking(vm); - - /* enabling migration mode should work now */ - rc = __enable_migration_mode(vm); - TEST_ASSERT_EQ(rc, 0); - TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); - errno = 0; - - /* execute another ESSA instruction to see this goes fine */ - vcpu->run->psw_addr = orig_psw; - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - /* - * With migration mode on, create a new memslot with dirty tracking off. - * This should turn off migration mode. - */ - TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); - vm_userspace_mem_region_add(vm, - VM_MEM_SRC_ANONYMOUS, - TEST_DATA_TWO_START_GFN << vm->page_shift, - TEST_DATA_TWO_MEMSLOT, - TEST_DATA_TWO_PAGE_COUNT, - 0 - ); - TEST_ASSERT(!is_migration_mode_on(vm), - "creating memslot without dirty tracking turns off migration mode" - ); - - /* ESSA instructions should still execute fine */ - vcpu->run->psw_addr = orig_psw; - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - /* - * Turn on dirty tracking on the new memslot. 
- * It should be possible to turn migration mode back on again. - */ - vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); - rc = __enable_migration_mode(vm); - TEST_ASSERT_EQ(rc, 0); - TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); - errno = 0; - - /* - * Turn off dirty tracking again, this time with just a flag change. - * Again, migration mode should turn off. - */ - TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); - vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0); - TEST_ASSERT(!is_migration_mode_on(vm), - "disabling dirty tracking should turn off migration mode" - ); - - /* ESSA instructions should still execute fine */ - vcpu->run->psw_addr = orig_psw; - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - kvm_vm_free(vm); -} - -/** - * Given a VM with the MAIN and TEST_DATA memslot, assert that both slots have - * CMMA attributes of all pages in both memslots and nothing more dirty. - * This has the useful side effect of ensuring nothing is CMMA dirty after this - * function. - */ -static void assert_all_slots_cmma_dirty(struct kvm_vm *vm) -{ - struct kvm_s390_cmma_log args; - - /* - * First iteration - everything should be dirty. - * Start at the main memslot... - */ - args = (struct kvm_s390_cmma_log){ - .start_gfn = 0, - .count = sizeof(cmma_value_buf), - .flags = 0, - .values = (__u64)&cmma_value_buf[0] - }; - memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); - vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); - TEST_ASSERT_EQ(args.count, MAIN_PAGE_COUNT); - TEST_ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT); - TEST_ASSERT_EQ(args.start_gfn, 0); - - /* ...and then - after a hole - the TEST_DATA memslot should follow */ - args = (struct kvm_s390_cmma_log){ - .start_gfn = MAIN_PAGE_COUNT, - .count = sizeof(cmma_value_buf), - .flags = 0, - .values = (__u64)&cmma_value_buf[0] - }; - memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); - vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); - TEST_ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT); - TEST_ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN); - TEST_ASSERT_EQ(args.remaining, 0); - - /* ...and nothing else should be there */ - args = (struct kvm_s390_cmma_log){ - .start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, - .count = sizeof(cmma_value_buf), - .flags = 0, - .values = (__u64)&cmma_value_buf[0] - }; - memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); - vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); - TEST_ASSERT_EQ(args.count, 0); - TEST_ASSERT_EQ(args.start_gfn, 0); - TEST_ASSERT_EQ(args.remaining, 0); -} - -/** - * Given a VM, assert no pages are CMMA dirty. - */ -static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) -{ - struct kvm_s390_cmma_log args; - - /* If we start from GFN 0 again, nothing should be dirty. */ - args = (struct kvm_s390_cmma_log){ - .start_gfn = 0, - .count = sizeof(cmma_value_buf), - .flags = 0, - .values = (__u64)&cmma_value_buf[0] - }; - memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); - vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); - if (args.count || args.remaining || args.start_gfn) - TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu", - args.start_gfn, - args.count, - args.remaining - ); -} - -static void test_get_inital_dirty(void) -{ - struct kvm_vm *vm = create_vm_two_memslots(); - struct kvm_vcpu *vcpu; - - enable_cmma(vm); - vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); - - /* - * Execute one essa instruction in the guest. 
Otherwise the guest will - * not have use_cmm enabled and GET_CMMA_BITS will return no pages. - */ - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - enable_dirty_tracking(vm); - enable_migration_mode(vm); - - assert_all_slots_cmma_dirty(vm); - - /* Start from the beginning again and make sure nothing else is dirty */ - assert_no_pages_cmma_dirty(vm); - - kvm_vm_free(vm); -} - -static void query_cmma_range(struct kvm_vm *vm, - u64 start_gfn, u64 gfn_count, - struct kvm_s390_cmma_log *res_out) -{ - *res_out = (struct kvm_s390_cmma_log){ - .start_gfn = start_gfn, - .count = gfn_count, - .flags = 0, - .values = (__u64)&cmma_value_buf[0] - }; - memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); - vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out); -} - -/** - * Assert the given cmma_log struct that was executed by query_cmma_range() - * indicates the first dirty gfn is at first_dirty_gfn and contains exactly - * dirty_gfn_count CMMA values. - */ -static void assert_cmma_dirty(u64 first_dirty_gfn, - u64 dirty_gfn_count, - const struct kvm_s390_cmma_log *res) -{ - TEST_ASSERT_EQ(res->start_gfn, first_dirty_gfn); - TEST_ASSERT_EQ(res->count, dirty_gfn_count); - for (size_t i = 0; i < dirty_gfn_count; i++) - TEST_ASSERT_EQ(cmma_value_buf[0], 0x0); /* stable state */ - TEST_ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */ -} - -static void test_get_skip_holes(void) -{ - size_t gfn_offset; - struct kvm_vm *vm = create_vm_two_memslots(); - struct kvm_s390_cmma_log log; - struct kvm_vcpu *vcpu; - u64 orig_psw; - - enable_cmma(vm); - vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data); - - orig_psw = vcpu->run->psw_addr; - - /* - * Execute some essa instructions in the guest. Otherwise the guest will - * not have use_cmm enabled and GET_CMMA_BITS will return no pages. - */ - vcpu_run(vcpu); - assert_exit_was_hypercall(vcpu); - - enable_dirty_tracking(vm); - enable_migration_mode(vm); - - /* un-dirty all pages */ - assert_all_slots_cmma_dirty(vm); - - /* Then, dirty just the TEST_DATA memslot */ - vcpu->run->psw_addr = orig_psw; - vcpu_run(vcpu); - - gfn_offset = TEST_DATA_START_GFN; - /** - * Query CMMA attributes of one page, starting at page 0. Since the - * main memslot was not touched by the VM, this should yield the first - * page of the TEST_DATA memslot. - * The dirty bitmap should now look like this: - * 0: not dirty - * [0x1, 0x200): dirty - */ - query_cmma_range(vm, 0, 1, &log); - assert_cmma_dirty(gfn_offset, 1, &log); - gfn_offset++; - - /** - * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA - * memslot. This should wrap back to the beginning of the TEST_DATA - * memslot, page 1. - * The dirty bitmap should now look like this: - * [0, 0x21): not dirty - * [0x21, 0x200): dirty - */ - query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log); - assert_cmma_dirty(gfn_offset, 0x20, &log); - gfn_offset += 0x20; - - /* Skip 32 pages */ - gfn_offset += 0x20; - - /** - * After skipping 32 pages, query the next 32 (0x20) pages. - * The dirty bitmap should now look like this: - * [0, 0x21): not dirty - * [0x21, 0x41): dirty - * [0x41, 0x61): not dirty - * [0x61, 0x200): dirty - */ - query_cmma_range(vm, gfn_offset, 0x20, &log); - assert_cmma_dirty(gfn_offset, 0x20, &log); - gfn_offset += 0x20; - - /** - * Query 1 page from the beginning of the TEST_DATA memslot. This should - * yield page 0x21. 
- * The dirty bitmap should now look like this: - * [0, 0x22): not dirty - * [0x22, 0x41): dirty - * [0x41, 0x61): not dirty - * [0x61, 0x200): dirty - */ - query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log); - assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log); - gfn_offset++; - - /** - * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot. - * This should yield pages [0x23, 0x33). - * The dirty bitmap should now look like this: - * [0, 0x22): not dirty - * 0x22: dirty - * [0x23, 0x33): not dirty - * [0x33, 0x41): dirty - * [0x41, 0x61): not dirty - * [0x61, 0x200): dirty - */ - gfn_offset = TEST_DATA_START_GFN + 0x23; - query_cmma_range(vm, gfn_offset, 15, &log); - assert_cmma_dirty(gfn_offset, 15, &log); - - /** - * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot. - * This should yield page [0x22, 0x33) - * The dirty bitmap should now look like this: - * [0, 0x33): not dirty - * [0x33, 0x41): dirty - * [0x41, 0x61): not dirty - * [0x61, 0x200): dirty - */ - gfn_offset = TEST_DATA_START_GFN + 0x22; - query_cmma_range(vm, gfn_offset, 17, &log); - assert_cmma_dirty(gfn_offset, 17, &log); - - /** - * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot. - * This should yield page 0x40 and nothing more, since there are more - * than 16 non-dirty pages after page 0x40. - * The dirty bitmap should now look like this: - * [0, 0x33): not dirty - * [0x33, 0x40): dirty - * [0x40, 0x61): not dirty - * [0x61, 0x200): dirty - */ - gfn_offset = TEST_DATA_START_GFN + 0x40; - query_cmma_range(vm, gfn_offset, 25, &log); - assert_cmma_dirty(gfn_offset, 1, &log); - - /** - * Query pages [0x33, 0x40). - * The dirty bitmap should now look like this: - * [0, 0x61): not dirty - * [0x61, 0x200): dirty - */ - gfn_offset = TEST_DATA_START_GFN + 0x33; - query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log); - assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log); - - /** - * Query the remaining pages [0x61, 0x200). - */ - gfn_offset = TEST_DATA_START_GFN; - query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log); - assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log); - - assert_no_pages_cmma_dirty(vm); -} - -struct testdef { - const char *name; - void (*test)(void); -} testlist[] = { - { "migration mode and dirty tracking", test_migration_mode }, - { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, - { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, - { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, -}; - -/** - * The kernel may support CMMA, but the machine may not (i.e. if running as - * guest-3). - * - * In this case, the CMMA capabilities are all there, but the CMMA-related - * ioctls fail. To find out whether the machine supports CMMA, create a - * temporary VM and then query the CMMA feature of the VM. 
- */ -static int machine_has_cmma(void) -{ - struct kvm_vm *vm = vm_create_barebones(); - int r; - - r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA); - kvm_vm_free(vm); - - return r; -} - -int main(int argc, char *argv[]) -{ - int idx; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION)); - TEST_REQUIRE(machine_has_cmma()); - - ksft_print_header(); - - ksft_set_plan(ARRAY_SIZE(testlist)); - - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - testlist[idx].test(); - ksft_test_result_pass("%s\n", testlist[idx].name); - } - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/s390x/config b/tools/testing/selftests/kvm/s390x/config deleted file mode 100644 index 23270f2d679f..000000000000 --- a/tools/testing/selftests/kvm/s390x/config +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_KVM=y -CONFIG_KVM_S390_UCONTROL=y diff --git a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c b/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c deleted file mode 100644 index 27255880dabd..000000000000 --- a/tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c +++ /dev/null @@ -1,301 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright IBM Corp. 2024 - * - * Authors: - * Hariharan Mari - * - * The tests compare the result of the KVM ioctl for obtaining CPU subfunction data with those - * from an ASM block performing the same CPU subfunction. Currently KVM doesn't mask instruction - * query data reported via the CPU Model, allowing us to directly compare it with the data - * acquired through executing the queries in the test. - */ - -#include -#include -#include -#include -#include "facility.h" - -#include "kvm_util.h" - -#define PLO_FUNCTION_MAX 256 - -/* Query available CPU subfunctions */ -struct kvm_s390_vm_cpu_subfunc cpu_subfunc; - -static void get_cpu_machine_subfuntions(struct kvm_vm *vm, - struct kvm_s390_vm_cpu_subfunc *cpu_subfunc) -{ - int r; - - r = __kvm_device_attr_get(vm->fd, KVM_S390_VM_CPU_MODEL, - KVM_S390_VM_CPU_MACHINE_SUBFUNC, cpu_subfunc); - - TEST_ASSERT(!r, "Get cpu subfunctions failed r=%d errno=%d", r, errno); -} - -static inline int plo_test_bit(unsigned char nr) -{ - unsigned long function = nr | 0x100; - int cc; - - asm volatile(" lgr 0,%[function]\n" - /* Parameter registers are ignored for "test bit" */ - " plo 0,0,0,0(0)\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) - : [function] "d" (function) - : "cc", "0"); - return cc == 0; -} - -/* Testing Perform Locked Operation (PLO) CPU subfunction's ASM block */ -static void test_plo_asm_block(u8 (*query)[32]) -{ - for (int i = 0; i < PLO_FUNCTION_MAX; ++i) { - if (plo_test_bit(i)) - (*query)[i >> 3] |= 0x80 >> (i & 7); - } -} - -/* Testing Crypto Compute Message Authentication Code (KMAC) CPU subfunction's ASM block */ -static void test_kmac_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb91e0000,0,2\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message with Chaining (KMC) CPU subfunction's ASM block */ -static void test_kmc_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb92f0000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message (KM) CPU subfunction's ASM block */ -static void test_km_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr 
%%r0,%%r0\n" - " .insn rre,0xb92e0000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Compute Intermediate Message Digest (KIMD) CPU subfunction's ASM block */ -static void test_kimd_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb93e0000,0,2\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Compute Last Message Digest (KLMD) CPU subfunction's ASM block */ -static void test_klmd_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb93f0000,0,2\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message with Counter (KMCTR) CPU subfunction's ASM block */ -static void test_kmctr_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rrf,0xb92d0000,2,4,6,0\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message with Cipher Feedback (KMF) CPU subfunction's ASM block */ -static void test_kmf_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb92a0000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message with Output Feedback (KMO) CPU subfunction's ASM block */ -static void test_kmo_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb92b0000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Perform Cryptographic Computation (PCC) CPU subfunction's ASM block */ -static void test_pcc_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb92c0000,0,0\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Perform Random Number Operation (PRNO) CPU subfunction's ASM block */ -static void test_prno_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb93c0000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Cipher Message with Authentication (KMA) CPU subfunction's ASM block */ -static void test_kma_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rrf,0xb9290000,2,4,6,0\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Crypto Compute Digital Signature Authentication (KDSA) CPU subfunction's ASM block */ -static void test_kdsa_asm_block(u8 (*query)[16]) -{ - asm volatile(" la %%r1,%[query]\n" - " xgr %%r0,%%r0\n" - " .insn rre,0xb93a0000,0,2\n" - : [query] "=R" (*query) - : - : "cc", "r0", "r1"); -} - -/* Testing Sort Lists (SORTL) CPU subfunction's ASM block */ -static void test_sortl_asm_block(u8 (*query)[32]) -{ - asm volatile(" lghi 0,0\n" - " la 1,%[query]\n" - " .insn rre,0xb9380000,2,4\n" - : [query] "=R" (*query) - : - : "cc", "0", "1"); -} - -/* Testing Deflate Conversion Call (DFLTCC) CPU subfunction's ASM block */ -static void test_dfltcc_asm_block(u8 (*query)[32]) -{ - asm volatile(" lghi 0,0\n" - " la 1,%[query]\n" - " .insn rrf,0xb9390000,2,4,6,0\n" - : [query] "=R" (*query) - : - : "cc", "0", "1"); -} - -/* - * Testing Perform Function with Concurrent Results (PFCR) - * CPU subfunctions's ASM block - */ -static void test_pfcr_asm_block(u8 (*query)[16]) -{ - asm volatile(" lghi 0,0\n" - " .insn rsy,0xeb0000000016,0,0,%[query]\n" - : [query] "=QS" (*query) - : - : "cc", "0"); -} - -typedef void 
(*testfunc_t)(u8 (*array)[]); - -struct testdef { - const char *subfunc_name; - u8 *subfunc_array; - size_t array_size; - testfunc_t test; - int facility_bit; -} testlist[] = { - /* - * PLO was introduced in the very first 64-bit machine generation. - * Hence it is assumed PLO is always installed in Z Arch. - */ - { "PLO", cpu_subfunc.plo, sizeof(cpu_subfunc.plo), test_plo_asm_block, 1 }, - /* MSA - Facility bit 17 */ - { "KMAC", cpu_subfunc.kmac, sizeof(cpu_subfunc.kmac), test_kmac_asm_block, 17 }, - { "KMC", cpu_subfunc.kmc, sizeof(cpu_subfunc.kmc), test_kmc_asm_block, 17 }, - { "KM", cpu_subfunc.km, sizeof(cpu_subfunc.km), test_km_asm_block, 17 }, - { "KIMD", cpu_subfunc.kimd, sizeof(cpu_subfunc.kimd), test_kimd_asm_block, 17 }, - { "KLMD", cpu_subfunc.klmd, sizeof(cpu_subfunc.klmd), test_klmd_asm_block, 17 }, - /* MSA - Facility bit 77 */ - { "KMCTR", cpu_subfunc.kmctr, sizeof(cpu_subfunc.kmctr), test_kmctr_asm_block, 77 }, - { "KMF", cpu_subfunc.kmf, sizeof(cpu_subfunc.kmf), test_kmf_asm_block, 77 }, - { "KMO", cpu_subfunc.kmo, sizeof(cpu_subfunc.kmo), test_kmo_asm_block, 77 }, - { "PCC", cpu_subfunc.pcc, sizeof(cpu_subfunc.pcc), test_pcc_asm_block, 77 }, - /* MSA5 - Facility bit 57 */ - { "PPNO", cpu_subfunc.ppno, sizeof(cpu_subfunc.ppno), test_prno_asm_block, 57 }, - /* MSA8 - Facility bit 146 */ - { "KMA", cpu_subfunc.kma, sizeof(cpu_subfunc.kma), test_kma_asm_block, 146 }, - /* MSA9 - Facility bit 155 */ - { "KDSA", cpu_subfunc.kdsa, sizeof(cpu_subfunc.kdsa), test_kdsa_asm_block, 155 }, - /* SORTL - Facility bit 150 */ - { "SORTL", cpu_subfunc.sortl, sizeof(cpu_subfunc.sortl), test_sortl_asm_block, 150 }, - /* DFLTCC - Facility bit 151 */ - { "DFLTCC", cpu_subfunc.dfltcc, sizeof(cpu_subfunc.dfltcc), test_dfltcc_asm_block, 151 }, - /* Concurrent-function facility - Facility bit 201 */ - { "PFCR", cpu_subfunc.pfcr, sizeof(cpu_subfunc.pfcr), test_pfcr_asm_block, 201 }, -}; - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - int idx; - - ksft_print_header(); - - vm = vm_create(1); - - memset(&cpu_subfunc, 0, sizeof(cpu_subfunc)); - get_cpu_machine_subfuntions(vm, &cpu_subfunc); - - ksft_set_plan(ARRAY_SIZE(testlist)); - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - if (test_facility(testlist[idx].facility_bit)) { - u8 *array = malloc(testlist[idx].array_size); - - testlist[idx].test((u8 (*)[testlist[idx].array_size])array); - - TEST_ASSERT_EQ(memcmp(testlist[idx].subfunc_array, - array, testlist[idx].array_size), 0); - - ksft_test_result_pass("%s\n", testlist[idx].subfunc_name); - free(array); - } else { - ksft_test_result_skip("%s feature is not avaialable\n", - testlist[idx].subfunc_name); - } - } - - kvm_vm_free(vm); - ksft_finished(); -} diff --git a/tools/testing/selftests/kvm/s390x/debug_test.c b/tools/testing/selftests/kvm/s390x/debug_test.c deleted file mode 100644 index ad8095968601..000000000000 --- a/tools/testing/selftests/kvm/s390x/debug_test.c +++ /dev/null @@ -1,160 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Test KVM debugging features. */ -#include "kvm_util.h" -#include "test_util.h" -#include "sie.h" - -#include - -#define __LC_SVC_NEW_PSW 0x1c0 -#define __LC_PGM_NEW_PSW 0x1d0 -#define IPA0_DIAG 0x8300 -#define PGM_SPECIFICATION 0x06 - -/* Common code for testing single-stepping interruptions. 
*/ -extern char int_handler[]; -asm("int_handler:\n" - "j .\n"); - -static struct kvm_vm *test_step_int_1(struct kvm_vcpu **vcpu, void *guest_code, - size_t new_psw_off, uint64_t *new_psw) -{ - struct kvm_guest_debug debug = {}; - struct kvm_regs regs; - struct kvm_vm *vm; - char *lowcore; - - vm = vm_create_with_one_vcpu(vcpu, guest_code); - lowcore = addr_gpa2hva(vm, 0); - new_psw[0] = (*vcpu)->run->psw_mask; - new_psw[1] = (uint64_t)int_handler; - memcpy(lowcore + new_psw_off, new_psw, 16); - vcpu_regs_get(*vcpu, ®s); - regs.gprs[2] = -1; - vcpu_regs_set(*vcpu, ®s); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; - vcpu_guest_debug_set(*vcpu, &debug); - vcpu_run(*vcpu); - - return vm; -} - -static void test_step_int(void *guest_code, size_t new_psw_off) -{ - struct kvm_vcpu *vcpu; - uint64_t new_psw[2]; - struct kvm_vm *vm; - - vm = test_step_int_1(&vcpu, guest_code, new_psw_off, new_psw); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); - TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); - TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); - kvm_vm_free(vm); -} - -/* Test single-stepping "boring" program interruptions. */ -extern char test_step_pgm_guest_code[]; -asm("test_step_pgm_guest_code:\n" - ".insn rr,0x1d00,%r1,%r0 /* dr %r1,%r0 */\n" - "j .\n"); - -static void test_step_pgm(void) -{ - test_step_int(test_step_pgm_guest_code, __LC_PGM_NEW_PSW); -} - -/* - * Test single-stepping program interruptions caused by DIAG. - * Userspace emulation must not interfere with single-stepping. - */ -extern char test_step_pgm_diag_guest_code[]; -asm("test_step_pgm_diag_guest_code:\n" - "diag %r0,%r0,0\n" - "j .\n"); - -static void test_step_pgm_diag(void) -{ - struct kvm_s390_irq irq = { - .type = KVM_S390_PROGRAM_INT, - .u.pgm.code = PGM_SPECIFICATION, - }; - struct kvm_vcpu *vcpu; - uint64_t new_psw[2]; - struct kvm_vm *vm; - - vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, - __LC_PGM_NEW_PSW, new_psw); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); - vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); - TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); - TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); - kvm_vm_free(vm); -} - -/* - * Test single-stepping program interruptions caused by ISKE. - * CPUSTAT_KSS handling must not interfere with single-stepping. - */ -extern char test_step_pgm_iske_guest_code[]; -asm("test_step_pgm_iske_guest_code:\n" - "iske %r2,%r2\n" - "j .\n"); - -static void test_step_pgm_iske(void) -{ - test_step_int(test_step_pgm_iske_guest_code, __LC_PGM_NEW_PSW); -} - -/* - * Test single-stepping program interruptions caused by LCTL. - * KVM emulation must not interfere with single-stepping. - */ -extern char test_step_pgm_lctl_guest_code[]; -asm("test_step_pgm_lctl_guest_code:\n" - "lctl %c0,%c0,1\n" - "j .\n"); - -static void test_step_pgm_lctl(void) -{ - test_step_int(test_step_pgm_lctl_guest_code, __LC_PGM_NEW_PSW); -} - -/* Test single-stepping supervisor-call interruptions. */ -extern char test_step_svc_guest_code[]; -asm("test_step_svc_guest_code:\n" - "svc 0\n" - "j .\n"); - -static void test_step_svc(void) -{ - test_step_int(test_step_svc_guest_code, __LC_SVC_NEW_PSW); -} - -/* Run all tests above. 
*/ -static struct testdef { - const char *name; - void (*test)(void); -} testlist[] = { - { "single-step pgm", test_step_pgm }, - { "single-step pgm caused by diag", test_step_pgm_diag }, - { "single-step pgm caused by iske", test_step_pgm_iske }, - { "single-step pgm caused by lctl", test_step_pgm_lctl }, - { "single-step svc", test_step_svc }, -}; - -int main(int argc, char *argv[]) -{ - int idx; - - ksft_print_header(); - ksft_set_plan(ARRAY_SIZE(testlist)); - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - testlist[idx].test(); - ksft_test_result_pass("%s\n", testlist[idx].name); - } - ksft_finished(); -} diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c deleted file mode 100644 index 4374b4cd2a80..000000000000 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ /dev/null @@ -1,1187 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Test for s390x KVM_S390_MEM_OP - * - * Copyright (C) 2019, Red Hat, Inc. - */ -#include -#include -#include -#include -#include - -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "kselftest.h" -#include "ucall_common.h" -#include "processor.h" - -enum mop_target { - LOGICAL, - SIDA, - ABSOLUTE, - INVALID, -}; - -enum mop_access_mode { - READ, - WRITE, - CMPXCHG, -}; - -struct mop_desc { - uintptr_t gaddr; - uintptr_t gaddr_v; - uint64_t set_flags; - unsigned int f_check : 1; - unsigned int f_inject : 1; - unsigned int f_key : 1; - unsigned int _gaddr_v : 1; - unsigned int _set_flags : 1; - unsigned int _sida_offset : 1; - unsigned int _ar : 1; - uint32_t size; - enum mop_target target; - enum mop_access_mode mode; - void *buf; - uint32_t sida_offset; - void *old; - uint8_t old_value[16]; - bool *cmpxchg_success; - uint8_t ar; - uint8_t key; -}; - -const uint8_t NO_KEY = 0xff; - -static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc *desc) -{ - struct kvm_s390_mem_op ksmo = { - .gaddr = (uintptr_t)desc->gaddr, - .size = desc->size, - .buf = ((uintptr_t)desc->buf), - .reserved = "ignored_ignored_ignored_ignored" - }; - - switch (desc->target) { - case LOGICAL: - if (desc->mode == READ) - ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; - if (desc->mode == WRITE) - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - break; - case SIDA: - if (desc->mode == READ) - ksmo.op = KVM_S390_MEMOP_SIDA_READ; - if (desc->mode == WRITE) - ksmo.op = KVM_S390_MEMOP_SIDA_WRITE; - break; - case ABSOLUTE: - if (desc->mode == READ) - ksmo.op = KVM_S390_MEMOP_ABSOLUTE_READ; - if (desc->mode == WRITE) - ksmo.op = KVM_S390_MEMOP_ABSOLUTE_WRITE; - if (desc->mode == CMPXCHG) { - ksmo.op = KVM_S390_MEMOP_ABSOLUTE_CMPXCHG; - ksmo.old_addr = (uint64_t)desc->old; - memcpy(desc->old_value, desc->old, desc->size); - } - break; - case INVALID: - ksmo.op = -1; - } - if (desc->f_check) - ksmo.flags |= KVM_S390_MEMOP_F_CHECK_ONLY; - if (desc->f_inject) - ksmo.flags |= KVM_S390_MEMOP_F_INJECT_EXCEPTION; - if (desc->_set_flags) - ksmo.flags = desc->set_flags; - if (desc->f_key && desc->key != NO_KEY) { - ksmo.flags |= KVM_S390_MEMOP_F_SKEY_PROTECTION; - ksmo.key = desc->key; - } - if (desc->_ar) - ksmo.ar = desc->ar; - else - ksmo.ar = 0; - if (desc->_sida_offset) - ksmo.sida_offset = desc->sida_offset; - - return ksmo; -} - -struct test_info { - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; -}; - -#define PRINT_MEMOP false -static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksmo) -{ - if (!PRINT_MEMOP) - return; - - if (!vcpu) - printf("vm memop("); - else - printf("vcpu memop("); - switch 
(ksmo->op) { - case KVM_S390_MEMOP_LOGICAL_READ: - printf("LOGICAL, READ, "); - break; - case KVM_S390_MEMOP_LOGICAL_WRITE: - printf("LOGICAL, WRITE, "); - break; - case KVM_S390_MEMOP_SIDA_READ: - printf("SIDA, READ, "); - break; - case KVM_S390_MEMOP_SIDA_WRITE: - printf("SIDA, WRITE, "); - break; - case KVM_S390_MEMOP_ABSOLUTE_READ: - printf("ABSOLUTE, READ, "); - break; - case KVM_S390_MEMOP_ABSOLUTE_WRITE: - printf("ABSOLUTE, WRITE, "); - break; - case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: - printf("ABSOLUTE, CMPXCHG, "); - break; - } - printf("gaddr=%llu, size=%u, buf=%llu, ar=%u, key=%u, old_addr=%llx", - ksmo->gaddr, ksmo->size, ksmo->buf, ksmo->ar, ksmo->key, - ksmo->old_addr); - if (ksmo->flags & KVM_S390_MEMOP_F_CHECK_ONLY) - printf(", CHECK_ONLY"); - if (ksmo->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) - printf(", INJECT_EXCEPTION"); - if (ksmo->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) - printf(", SKEY_PROTECTION"); - puts(")"); -} - -static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, - struct mop_desc *desc) -{ - struct kvm_vcpu *vcpu = info.vcpu; - - if (!vcpu) - return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); - else - return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo); -} - -static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, - struct mop_desc *desc) -{ - int r; - - r = err_memop_ioctl(info, ksmo, desc); - if (ksmo->op == KVM_S390_MEMOP_ABSOLUTE_CMPXCHG) { - if (desc->cmpxchg_success) { - int diff = memcmp(desc->old_value, desc->old, desc->size); - *desc->cmpxchg_success = !diff; - } - } - TEST_ASSERT(!r, __KVM_IOCTL_ERROR("KVM_S390_MEM_OP", r)); -} - -#define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...) \ -({ \ - struct test_info __info = (info_p); \ - struct mop_desc __desc = { \ - .target = (mop_target_p), \ - .mode = (access_mode_p), \ - .buf = (buf_p), \ - .size = (size_p), \ - __VA_ARGS__ \ - }; \ - struct kvm_s390_mem_op __ksmo; \ - \ - if (__desc._gaddr_v) { \ - if (__desc.target == ABSOLUTE) \ - __desc.gaddr = addr_gva2gpa(__info.vm, __desc.gaddr_v); \ - else \ - __desc.gaddr = __desc.gaddr_v; \ - } \ - __ksmo = ksmo_from_desc(&__desc); \ - print_memop(__info.vcpu, &__ksmo); \ - err##memop_ioctl(__info, &__ksmo, &__desc); \ -}) - -#define MOP(...) MEMOP(, __VA_ARGS__) -#define ERR_MOP(...) MEMOP(err_, __VA_ARGS__) - -#define GADDR(a) .gaddr = ((uintptr_t)a) -#define GADDR_V(v) ._gaddr_v = 1, .gaddr_v = ((uintptr_t)v) -#define CHECK_ONLY .f_check = 1 -#define SET_FLAGS(f) ._set_flags = 1, .set_flags = (f) -#define SIDA_OFFSET(o) ._sida_offset = 1, .sida_offset = (o) -#define AR(a) ._ar = 1, .ar = (a) -#define KEY(a) .f_key = 1, .key = (a) -#define INJECT .f_inject = 1 -#define CMPXCHG_OLD(o) .old = (o) -#define CMPXCHG_SUCCESS(s) .cmpxchg_success = (s) - -#define CHECK_N_DO(f, ...) 
({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) - -#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) -#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) - -static uint8_t __aligned(PAGE_SIZE) mem1[65536]; -static uint8_t __aligned(PAGE_SIZE) mem2[65536]; - -struct test_default { - struct kvm_vm *kvm_vm; - struct test_info vm; - struct test_info vcpu; - struct kvm_run *run; - int size; -}; - -static struct test_default test_default_init(void *guest_code) -{ - struct kvm_vcpu *vcpu; - struct test_default t; - - t.size = min((size_t)kvm_check_cap(KVM_CAP_S390_MEM_OP), sizeof(mem1)); - t.kvm_vm = vm_create_with_one_vcpu(&vcpu, guest_code); - t.vm = (struct test_info) { t.kvm_vm, NULL }; - t.vcpu = (struct test_info) { t.kvm_vm, vcpu }; - t.run = vcpu->run; - return t; -} - -enum stage { - /* Synced state set by host, e.g. DAT */ - STAGE_INITED, - /* Guest did nothing */ - STAGE_IDLED, - /* Guest set storage keys (specifics up to test case) */ - STAGE_SKEYS_SET, - /* Guest copied memory (locations up to test case) */ - STAGE_COPIED, - /* End of guest code reached */ - STAGE_DONE, -}; - -#define HOST_SYNC(info_p, stage) \ -({ \ - struct test_info __info = (info_p); \ - struct kvm_vcpu *__vcpu = __info.vcpu; \ - struct ucall uc; \ - int __stage = (stage); \ - \ - vcpu_run(__vcpu); \ - get_ucall(__vcpu, &uc); \ - if (uc.cmd == UCALL_ABORT) { \ - REPORT_GUEST_ASSERT(uc); \ - } \ - TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ - TEST_ASSERT_EQ(uc.args[1], __stage); \ -}) \ - -static void prepare_mem12(void) -{ - int i; - - for (i = 0; i < sizeof(mem1); i++) - mem1[i] = rand(); - memset(mem2, 0xaa, sizeof(mem2)); -} - -#define ASSERT_MEM_EQ(p1, p2, size) \ - TEST_ASSERT(!memcmp(p1, p2, size), "Memory contents do not match!") - -static void default_write_read(struct test_info copy_cpu, struct test_info mop_cpu, - enum mop_target mop_target, uint32_t size, uint8_t key) -{ - prepare_mem12(); - CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, - GADDR_V(mem1), KEY(key)); - HOST_SYNC(copy_cpu, STAGE_COPIED); - CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, - GADDR_V(mem2), KEY(key)); - ASSERT_MEM_EQ(mem1, mem2, size); -} - -static void default_read(struct test_info copy_cpu, struct test_info mop_cpu, - enum mop_target mop_target, uint32_t size, uint8_t key) -{ - prepare_mem12(); - CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, GADDR_V(mem1)); - HOST_SYNC(copy_cpu, STAGE_COPIED); - CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, - GADDR_V(mem2), KEY(key)); - ASSERT_MEM_EQ(mem1, mem2, size); -} - -static void default_cmpxchg(struct test_default *test, uint8_t key) -{ - for (int size = 1; size <= 16; size *= 2) { - for (int offset = 0; offset < 16; offset += size) { - uint8_t __aligned(16) new[16] = {}; - uint8_t __aligned(16) old[16]; - bool succ; - - prepare_mem12(); - default_write_read(test->vcpu, test->vcpu, LOGICAL, 16, NO_KEY); - - memcpy(&old, mem1, 16); - MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, - size, GADDR_V(mem1 + offset), - CMPXCHG_OLD(old + offset), - CMPXCHG_SUCCESS(&succ), KEY(key)); - HOST_SYNC(test->vcpu, STAGE_COPIED); - MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); - TEST_ASSERT(succ, "exchange of values should succeed"); - memcpy(mem1 + offset, new + offset, size); - ASSERT_MEM_EQ(mem1, mem2, 16); - - memcpy(&old, mem1, 16); - new[offset]++; - old[offset]++; - MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, - size, GADDR_V(mem1 + offset), - CMPXCHG_OLD(old + offset), - CMPXCHG_SUCCESS(&succ), KEY(key)); - HOST_SYNC(test->vcpu, 
STAGE_COPIED); - MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); - TEST_ASSERT(!succ, "exchange of values should not succeed"); - ASSERT_MEM_EQ(mem1, mem2, 16); - ASSERT_MEM_EQ(&old, mem1, 16); - } - } -} - -static void guest_copy(void) -{ - GUEST_SYNC(STAGE_INITED); - memcpy(&mem2, &mem1, sizeof(mem2)); - GUEST_SYNC(STAGE_COPIED); -} - -static void test_copy(void) -{ - struct test_default t = test_default_init(guest_copy); - - HOST_SYNC(t.vcpu, STAGE_INITED); - - default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, NO_KEY); - - kvm_vm_free(t.kvm_vm); -} - -static void test_copy_access_register(void) -{ - struct test_default t = test_default_init(guest_copy); - - HOST_SYNC(t.vcpu, STAGE_INITED); - - prepare_mem12(); - t.run->psw_mask &= ~(3UL << (63 - 17)); - t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ - - /* - * Primary address space gets used if an access register - * contains zero. The host makes use of AR[1] so is a good - * candidate to ensure the guest AR (of zero) is used. - */ - CHECK_N_DO(MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, - GADDR_V(mem1), AR(1)); - HOST_SYNC(t.vcpu, STAGE_COPIED); - - CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, t.size, - GADDR_V(mem2), AR(1)); - ASSERT_MEM_EQ(mem1, mem2, t.size); - - kvm_vm_free(t.kvm_vm); -} - -static void set_storage_key_range(void *addr, size_t len, uint8_t key) -{ - uintptr_t _addr, abs, i; - int not_mapped = 0; - - _addr = (uintptr_t)addr; - for (i = _addr & PAGE_MASK; i < _addr + len; i += PAGE_SIZE) { - abs = i; - asm volatile ( - "lra %[abs], 0(0,%[abs])\n" - " jz 0f\n" - " llill %[not_mapped],1\n" - " j 1f\n" - "0: sske %[key], %[abs]\n" - "1:" - : [abs] "+&a" (abs), [not_mapped] "+r" (not_mapped) - : [key] "r" (key) - : "cc" - ); - GUEST_ASSERT_EQ(not_mapped, 0); - } -} - -static void guest_copy_key(void) -{ - set_storage_key_range(mem1, sizeof(mem1), 0x90); - set_storage_key_range(mem2, sizeof(mem2), 0x90); - GUEST_SYNC(STAGE_SKEYS_SET); - - for (;;) { - memcpy(&mem2, &mem1, sizeof(mem2)); - GUEST_SYNC(STAGE_COPIED); - } -} - -static void test_copy_key(void) -{ - struct test_default t = test_default_init(guest_copy_key); - - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vm, no key */ - default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, NO_KEY); - - /* vm/vcpu, machting key or key 0 */ - default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 0); - default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); - default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 0); - default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); - /* - * There used to be different code paths for key handling depending on - * if the region crossed a page boundary. - * There currently are not, but the more tests the merrier. 
- */ - default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 0); - default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 9); - default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 0); - default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 9); - - /* vm/vcpu, mismatching keys on read, but no fetch protection */ - default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); - default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 2); - - kvm_vm_free(t.kvm_vm); -} - -static void test_cmpxchg_key(void) -{ - struct test_default t = test_default_init(guest_copy_key); - - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - default_cmpxchg(&t, NO_KEY); - default_cmpxchg(&t, 0); - default_cmpxchg(&t, 9); - - kvm_vm_free(t.kvm_vm); -} - -static __uint128_t cut_to_size(int size, __uint128_t val) -{ - switch (size) { - case 1: - return (uint8_t)val; - case 2: - return (uint16_t)val; - case 4: - return (uint32_t)val; - case 8: - return (uint64_t)val; - case 16: - return val; - } - GUEST_FAIL("Invalid size = %u", size); - return 0; -} - -static bool popcount_eq(__uint128_t a, __uint128_t b) -{ - unsigned int count_a, count_b; - - count_a = __builtin_popcountl((uint64_t)(a >> 64)) + - __builtin_popcountl((uint64_t)a); - count_b = __builtin_popcountl((uint64_t)(b >> 64)) + - __builtin_popcountl((uint64_t)b); - return count_a == count_b; -} - -static __uint128_t rotate(int size, __uint128_t val, int amount) -{ - unsigned int bits = size * 8; - - amount = (amount + bits) % bits; - val = cut_to_size(size, val); - if (!amount) - return val; - return (val << (bits - amount)) | (val >> amount); -} - -const unsigned int max_block = 16; - -static void choose_block(bool guest, int i, int *size, int *offset) -{ - unsigned int rand; - - rand = i; - if (guest) { - rand = rand * 19 + 11; - *size = 1 << ((rand % 3) + 2); - rand = rand * 19 + 11; - *offset = (rand % max_block) & ~(*size - 1); - } else { - rand = rand * 17 + 5; - *size = 1 << (rand % 5); - rand = rand * 17 + 5; - *offset = (rand % max_block) & ~(*size - 1); - } -} - -static __uint128_t permutate_bits(bool guest, int i, int size, __uint128_t old) -{ - unsigned int rand; - int amount; - bool swap; - - rand = i; - rand = rand * 3 + 1; - if (guest) - rand = rand * 3 + 1; - swap = rand % 2 == 0; - if (swap) { - int i, j; - __uint128_t new; - uint8_t byte0, byte1; - - rand = rand * 3 + 1; - i = rand % size; - rand = rand * 3 + 1; - j = rand % size; - if (i == j) - return old; - new = rotate(16, old, i * 8); - byte0 = new & 0xff; - new &= ~0xff; - new = rotate(16, new, -i * 8); - new = rotate(16, new, j * 8); - byte1 = new & 0xff; - new = (new & ~0xff) | byte0; - new = rotate(16, new, -j * 8); - new = rotate(16, new, i * 8); - new = new | byte1; - new = rotate(16, new, -i * 8); - return new; - } - rand = rand * 3 + 1; - amount = rand % (size * 8); - return rotate(size, old, amount); -} - -static bool _cmpxchg(int size, void *target, __uint128_t *old_addr, __uint128_t new) -{ - bool ret; - - switch (size) { - case 4: { - uint32_t old = *old_addr; - - asm volatile ("cs %[old],%[new],%[address]" - : [old] "+d" (old), - [address] "+Q" (*(uint32_t *)(target)) - : [new] "d" ((uint32_t)new) - : "cc" - ); - ret = old == (uint32_t)*old_addr; - *old_addr = old; - return ret; - } - case 8: { - uint64_t old = *old_addr; - - asm volatile ("csg %[old],%[new],%[address]" - : [old] "+d" (old), - [address] "+Q" (*(uint64_t *)(target)) - : [new] "d" ((uint64_t)new) - : "cc" - ); - ret = old == (uint64_t)*old_addr; - *old_addr = old; - return ret; - } - case 16: { - __uint128_t old = *old_addr; - - asm volatile ("cdsg 
%[old],%[new],%[address]" - : [old] "+d" (old), - [address] "+Q" (*(__uint128_t *)(target)) - : [new] "d" (new) - : "cc" - ); - ret = old == *old_addr; - *old_addr = old; - return ret; - } - } - GUEST_FAIL("Invalid size = %u", size); - return 0; -} - -const unsigned int cmpxchg_iter_outer = 100, cmpxchg_iter_inner = 10000; - -static void guest_cmpxchg_key(void) -{ - int size, offset; - __uint128_t old, new; - - set_storage_key_range(mem1, max_block, 0x10); - set_storage_key_range(mem2, max_block, 0x10); - GUEST_SYNC(STAGE_SKEYS_SET); - - for (int i = 0; i < cmpxchg_iter_outer; i++) { - do { - old = 1; - } while (!_cmpxchg(16, mem1, &old, 0)); - for (int j = 0; j < cmpxchg_iter_inner; j++) { - choose_block(true, i + j, &size, &offset); - do { - new = permutate_bits(true, i + j, size, old); - } while (!_cmpxchg(size, mem2 + offset, &old, new)); - } - } - - GUEST_SYNC(STAGE_DONE); -} - -static void *run_guest(void *data) -{ - struct test_info *info = data; - - HOST_SYNC(*info, STAGE_DONE); - return NULL; -} - -static char *quad_to_char(__uint128_t *quad, int size) -{ - return ((char *)quad) + (sizeof(*quad) - size); -} - -static void test_cmpxchg_key_concurrent(void) -{ - struct test_default t = test_default_init(guest_cmpxchg_key); - int size, offset; - __uint128_t old, new; - bool success; - pthread_t thread; - - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - prepare_mem12(); - MOP(t.vcpu, LOGICAL, WRITE, mem1, max_block, GADDR_V(mem2)); - pthread_create(&thread, NULL, run_guest, &t.vcpu); - - for (int i = 0; i < cmpxchg_iter_outer; i++) { - do { - old = 0; - new = 1; - MOP(t.vm, ABSOLUTE, CMPXCHG, &new, - sizeof(new), GADDR_V(mem1), - CMPXCHG_OLD(&old), - CMPXCHG_SUCCESS(&success), KEY(1)); - } while (!success); - for (int j = 0; j < cmpxchg_iter_inner; j++) { - choose_block(false, i + j, &size, &offset); - do { - new = permutate_bits(false, i + j, size, old); - MOP(t.vm, ABSOLUTE, CMPXCHG, quad_to_char(&new, size), - size, GADDR_V(mem2 + offset), - CMPXCHG_OLD(quad_to_char(&old, size)), - CMPXCHG_SUCCESS(&success), KEY(1)); - } while (!success); - } - } - - pthread_join(thread, NULL); - - MOP(t.vcpu, LOGICAL, READ, mem2, max_block, GADDR_V(mem2)); - TEST_ASSERT(popcount_eq(*(__uint128_t *)mem1, *(__uint128_t *)mem2), - "Must retain number of set bits"); - - kvm_vm_free(t.kvm_vm); -} - -static void guest_copy_key_fetch_prot(void) -{ - /* - * For some reason combining the first sync with override enablement - * results in an exception when calling HOST_SYNC. - */ - GUEST_SYNC(STAGE_INITED); - /* Storage protection override applies to both store and fetch. 
*/ - set_storage_key_range(mem1, sizeof(mem1), 0x98); - set_storage_key_range(mem2, sizeof(mem2), 0x98); - GUEST_SYNC(STAGE_SKEYS_SET); - - for (;;) { - memcpy(&mem2, &mem1, sizeof(mem2)); - GUEST_SYNC(STAGE_COPIED); - } -} - -static void test_copy_key_storage_prot_override(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot); - - HOST_SYNC(t.vcpu, STAGE_INITED); - t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; - t.run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vcpu, mismatching keys, storage protection override in effect */ - default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); - - kvm_vm_free(t.kvm_vm); -} - -static void test_copy_key_fetch_prot(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot); - - HOST_SYNC(t.vcpu, STAGE_INITED); - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vm/vcpu, matching key, fetch protection in effect */ - default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); - default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); - - kvm_vm_free(t.kvm_vm); -} - -#define ERR_PROT_MOP(...) \ -({ \ - int rv; \ - \ - rv = ERR_MOP(__VA_ARGS__); \ - TEST_ASSERT(rv == 4, "Should result in protection exception"); \ -}) - -static void guest_error_key(void) -{ - GUEST_SYNC(STAGE_INITED); - set_storage_key_range(mem1, PAGE_SIZE, 0x18); - set_storage_key_range(mem1 + PAGE_SIZE, sizeof(mem1) - PAGE_SIZE, 0x98); - GUEST_SYNC(STAGE_SKEYS_SET); - GUEST_SYNC(STAGE_IDLED); -} - -static void test_errors_key(void) -{ - struct test_default t = test_default_init(guest_error_key); - - HOST_SYNC(t.vcpu, STAGE_INITED); - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vm/vcpu, mismatching keys, fetch protection in effect */ - CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); - - kvm_vm_free(t.kvm_vm); -} - -static void test_errors_cmpxchg_key(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot); - int i; - - HOST_SYNC(t.vcpu, STAGE_INITED); - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - for (i = 1; i <= 16; i *= 2) { - __uint128_t old = 0; - - ERR_PROT_MOP(t.vm, ABSOLUTE, CMPXCHG, mem2, i, GADDR_V(mem2), - CMPXCHG_OLD(&old), KEY(2)); - } - - kvm_vm_free(t.kvm_vm); -} - -static void test_termination(void) -{ - struct test_default t = test_default_init(guest_error_key); - uint64_t prefix; - uint64_t teid; - uint64_t teid_mask = BIT(63 - 56) | BIT(63 - 60) | BIT(63 - 61); - uint64_t psw[2]; - - HOST_SYNC(t.vcpu, STAGE_INITED); - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vcpu, mismatching keys after first page */ - ERR_PROT_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(1), INJECT); - /* - * The memop injected a program exception and the test needs to check the - * Translation-Exception Identification (TEID). It is necessary to run - * the guest in order to be able to read the TEID from guest memory. - * Set the guest program new PSW, so the guest state is not clobbered. 
- */ - prefix = t.run->s.regs.prefix; - psw[0] = t.run->psw_mask; - psw[1] = t.run->psw_addr; - MOP(t.vm, ABSOLUTE, WRITE, psw, sizeof(psw), GADDR(prefix + 464)); - HOST_SYNC(t.vcpu, STAGE_IDLED); - MOP(t.vm, ABSOLUTE, READ, &teid, sizeof(teid), GADDR(prefix + 168)); - /* Bits 56, 60, 61 form a code, 0 being the only one allowing for termination */ - TEST_ASSERT_EQ(teid & teid_mask, 0); - - kvm_vm_free(t.kvm_vm); -} - -static void test_errors_key_storage_prot_override(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot); - - HOST_SYNC(t.vcpu, STAGE_INITED); - t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; - t.run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vm, mismatching keys, storage protection override not applicable to vm */ - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem2), KEY(2)); - - kvm_vm_free(t.kvm_vm); -} - -const uint64_t last_page_addr = -PAGE_SIZE; - -static void guest_copy_key_fetch_prot_override(void) -{ - int i; - char *page_0 = 0; - - GUEST_SYNC(STAGE_INITED); - set_storage_key_range(0, PAGE_SIZE, 0x18); - set_storage_key_range((void *)last_page_addr, PAGE_SIZE, 0x0); - asm volatile ("sske %[key],%[addr]\n" :: [addr] "r"(0L), [key] "r"(0x18) : "cc"); - GUEST_SYNC(STAGE_SKEYS_SET); - - for (;;) { - for (i = 0; i < PAGE_SIZE; i++) - page_0[i] = mem1[i]; - GUEST_SYNC(STAGE_COPIED); - } -} - -static void test_copy_key_fetch_prot_override(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); - vm_vaddr_t guest_0_page, guest_last_page; - - guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); - guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); - if (guest_0_page != 0 || guest_last_page != last_page_addr) { - print_skip("did not allocate guest pages at required positions"); - goto out; - } - - HOST_SYNC(t.vcpu, STAGE_INITED); - t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; - t.run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vcpu, mismatching keys on fetch, fetch protection override applies */ - prepare_mem12(); - MOP(t.vcpu, LOGICAL, WRITE, mem1, PAGE_SIZE, GADDR_V(mem1)); - HOST_SYNC(t.vcpu, STAGE_COPIED); - CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); - ASSERT_MEM_EQ(mem1, mem2, 2048); - - /* - * vcpu, mismatching keys on fetch, fetch protection override applies, - * wraparound - */ - prepare_mem12(); - MOP(t.vcpu, LOGICAL, WRITE, mem1, 2 * PAGE_SIZE, GADDR_V(guest_last_page)); - HOST_SYNC(t.vcpu, STAGE_COPIED); - CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048, - GADDR_V(guest_last_page), KEY(2)); - ASSERT_MEM_EQ(mem1, mem2, 2048); - -out: - kvm_vm_free(t.kvm_vm); -} - -static void test_errors_key_fetch_prot_override_not_enabled(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); - vm_vaddr_t guest_0_page, guest_last_page; - - guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); - guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); - if (guest_0_page != 0 || guest_last_page != last_page_addr) { - print_skip("did not allocate guest pages at required positions"); - goto out; - } - HOST_SYNC(t.vcpu, STAGE_INITED); - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* vcpu, mismatching keys on fetch, fetch protection override not enabled */ - CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 
2048, GADDR_V(0), KEY(2)); - -out: - kvm_vm_free(t.kvm_vm); -} - -static void test_errors_key_fetch_prot_override_enabled(void) -{ - struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); - vm_vaddr_t guest_0_page, guest_last_page; - - guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); - guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); - if (guest_0_page != 0 || guest_last_page != last_page_addr) { - print_skip("did not allocate guest pages at required positions"); - goto out; - } - HOST_SYNC(t.vcpu, STAGE_INITED); - t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; - t.run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - - /* - * vcpu, mismatching keys on fetch, - * fetch protection override does not apply because memory range exceeded - */ - CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 2048 + 1, GADDR_V(0), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048 + 1, - GADDR_V(guest_last_page), KEY(2)); - /* vm, fetch protected override does not apply */ - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR(0), KEY(2)); - CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); - -out: - kvm_vm_free(t.kvm_vm); -} - -static void guest_idle(void) -{ - GUEST_SYNC(STAGE_INITED); /* for consistency's sake */ - for (;;) - GUEST_SYNC(STAGE_IDLED); -} - -static void _test_errors_common(struct test_info info, enum mop_target target, int size) -{ - int rv; - - /* Bad size: */ - rv = ERR_MOP(info, target, WRITE, mem1, -1, GADDR_V(mem1)); - TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); - - /* Zero size: */ - rv = ERR_MOP(info, target, WRITE, mem1, 0, GADDR_V(mem1)); - TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), - "ioctl allows 0 as size"); - - /* Bad flags: */ - rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), SET_FLAGS(-1)); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); - - /* Bad guest address: */ - rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY); - TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address with CHECK_ONLY"); - rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL)); - TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address on write"); - - /* Bad host address: */ - rv = ERR_MOP(info, target, WRITE, 0, size, GADDR_V(mem1)); - TEST_ASSERT(rv == -1 && errno == EFAULT, - "ioctl does not report bad host memory address"); - - /* Bad key: */ - rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), KEY(17)); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows invalid key"); -} - -static void test_errors(void) -{ - struct test_default t = test_default_init(guest_idle); - int rv; - - HOST_SYNC(t.vcpu, STAGE_INITED); - - _test_errors_common(t.vcpu, LOGICAL, t.size); - _test_errors_common(t.vm, ABSOLUTE, t.size); - - /* Bad operation: */ - rv = ERR_MOP(t.vcpu, INVALID, WRITE, mem1, t.size, GADDR_V(mem1)); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); - /* virtual addresses are not translated when passing INVALID */ - rv = ERR_MOP(t.vm, INVALID, WRITE, mem1, PAGE_SIZE, GADDR(0)); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); - - /* Bad access register: */ - t.run->psw_mask &= ~(3UL << (63 - 17)); - t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ - HOST_SYNC(t.vcpu, STAGE_IDLED); /* To sync new state to SIE block 
*/ - rv = ERR_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), AR(17)); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15"); - t.run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ - HOST_SYNC(t.vcpu, STAGE_IDLED); /* Run to sync new state */ - - /* Check that the SIDA calls are rejected for non-protected guests */ - rv = ERR_MOP(t.vcpu, SIDA, READ, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); - TEST_ASSERT(rv == -1 && errno == EINVAL, - "ioctl does not reject SIDA_READ in non-protected mode"); - rv = ERR_MOP(t.vcpu, SIDA, WRITE, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); - TEST_ASSERT(rv == -1 && errno == EINVAL, - "ioctl does not reject SIDA_WRITE in non-protected mode"); - - kvm_vm_free(t.kvm_vm); -} - -static void test_errors_cmpxchg(void) -{ - struct test_default t = test_default_init(guest_idle); - __uint128_t old; - int rv, i, power = 1; - - HOST_SYNC(t.vcpu, STAGE_INITED); - - for (i = 0; i < 32; i++) { - if (i == power) { - power *= 2; - continue; - } - rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1), - CMPXCHG_OLD(&old)); - TEST_ASSERT(rv == -1 && errno == EINVAL, - "ioctl allows bad size for cmpxchg"); - } - for (i = 1; i <= 16; i *= 2) { - rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR((void *)~0xfffUL), - CMPXCHG_OLD(&old)); - TEST_ASSERT(rv > 0, "ioctl allows bad guest address for cmpxchg"); - } - for (i = 2; i <= 16; i *= 2) { - rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1 + 1), - CMPXCHG_OLD(&old)); - TEST_ASSERT(rv == -1 && errno == EINVAL, - "ioctl allows bad alignment for cmpxchg"); - } - - kvm_vm_free(t.kvm_vm); -} - -int main(int argc, char *argv[]) -{ - int extension_cap, idx; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP)); - extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); - - struct testdef { - const char *name; - void (*test)(void); - bool requirements_met; - } testlist[] = { - { - .name = "simple copy", - .test = test_copy, - .requirements_met = true, - }, - { - .name = "generic error checks", - .test = test_errors, - .requirements_met = true, - }, - { - .name = "copy with storage keys", - .test = test_copy_key, - .requirements_met = extension_cap > 0, - }, - { - .name = "cmpxchg with storage keys", - .test = test_cmpxchg_key, - .requirements_met = extension_cap & 0x2, - }, - { - .name = "concurrently cmpxchg with storage keys", - .test = test_cmpxchg_key_concurrent, - .requirements_met = extension_cap & 0x2, - }, - { - .name = "copy with key storage protection override", - .test = test_copy_key_storage_prot_override, - .requirements_met = extension_cap > 0, - }, - { - .name = "copy with key fetch protection", - .test = test_copy_key_fetch_prot, - .requirements_met = extension_cap > 0, - }, - { - .name = "copy with key fetch protection override", - .test = test_copy_key_fetch_prot_override, - .requirements_met = extension_cap > 0, - }, - { - .name = "copy with access register mode", - .test = test_copy_access_register, - .requirements_met = true, - }, - { - .name = "error checks with key", - .test = test_errors_key, - .requirements_met = extension_cap > 0, - }, - { - .name = "error checks for cmpxchg with key", - .test = test_errors_cmpxchg_key, - .requirements_met = extension_cap & 0x2, - }, - { - .name = "error checks for cmpxchg", - .test = test_errors_cmpxchg, - .requirements_met = extension_cap & 0x2, - }, - { - .name = "termination", - .test = test_termination, - .requirements_met = extension_cap > 0, - }, - { - .name = "error checks with key storage protection override", - .test = 
test_errors_key_storage_prot_override, - .requirements_met = extension_cap > 0, - }, - { - .name = "error checks without key fetch prot override", - .test = test_errors_key_fetch_prot_override_not_enabled, - .requirements_met = extension_cap > 0, - }, - { - .name = "error checks with key fetch prot override", - .test = test_errors_key_fetch_prot_override_enabled, - .requirements_met = extension_cap > 0, - }, - }; - - ksft_print_header(); - ksft_set_plan(ARRAY_SIZE(testlist)); - - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - if (testlist[idx].requirements_met) { - testlist[idx].test(); - ksft_test_result_pass("%s\n", testlist[idx].name); - } else { - ksft_test_result_skip("%s - requirements not met (kernel has extension cap %#x)\n", - testlist[idx].name, extension_cap); - } - } - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c deleted file mode 100644 index b58f75b381e5..000000000000 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ /dev/null @@ -1,313 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Test for s390x CPU resets - * - * Copyright (C) 2020, IBM - */ - -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "kselftest.h" - -#define LOCAL_IRQS 32 - -#define ARBITRARY_NON_ZERO_VCPU_ID 3 - -struct kvm_s390_irq buf[ARBITRARY_NON_ZERO_VCPU_ID + LOCAL_IRQS]; - -static uint8_t regs_null[512]; - -static void guest_code_initial(void) -{ - /* set several CRs to "safe" value */ - unsigned long cr2_59 = 0x10; /* enable guarded storage */ - unsigned long cr8_63 = 0x1; /* monitor mask = 1 */ - unsigned long cr10 = 1; /* PER START */ - unsigned long cr11 = -1; /* PER END */ - - - /* Dirty registers */ - asm volatile ( - " lghi 2,0x11\n" /* Round toward 0 */ - " sfpc 2\n" /* set fpc to !=0 */ - " lctlg 2,2,%0\n" - " lctlg 8,8,%1\n" - " lctlg 10,10,%2\n" - " lctlg 11,11,%3\n" - /* now clobber some general purpose regs */ - " llihh 0,0xffff\n" - " llihl 1,0x5555\n" - " llilh 2,0xaaaa\n" - " llill 3,0x0000\n" - /* now clobber a floating point reg */ - " lghi 4,0x1\n" - " cdgbr 0,4\n" - /* now clobber an access reg */ - " sar 9,4\n" - /* We embed diag 501 here to control register content */ - " diag 0,0,0x501\n" - : - : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11) - /* no clobber list as this should not return */ - ); -} - -static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) -{ - uint64_t eval_reg; - - eval_reg = vcpu_get_reg(vcpu, id); - TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); -} - -static void assert_noirq(struct kvm_vcpu *vcpu) -{ - struct kvm_s390_irq_state irq_state; - int irqs; - - irq_state.len = sizeof(buf); - irq_state.buf = (unsigned long)buf; - irqs = __vcpu_ioctl(vcpu, KVM_S390_GET_IRQ_STATE, &irq_state); - /* - * irqs contains the number of retrieved interrupts. Any interrupt - * (notably, the emergency call interrupt we have injected) should - * be cleared by the resets, so this should be 0. 
- */ - TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d", errno); - TEST_ASSERT(!irqs, "IRQ pending"); -} - -static void assert_clear(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; - struct kvm_sregs sregs; - struct kvm_regs regs; - struct kvm_fpu fpu; - - vcpu_regs_get(vcpu, ®s); - TEST_ASSERT(!memcmp(®s.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); - - vcpu_sregs_get(vcpu, &sregs); - TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); - - vcpu_fpu_get(vcpu, &fpu); - TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); - - /* sync regs */ - TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)), - "gprs0-15 == 0 (sync_regs)"); - - TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)), - "acrs0-15 == 0 (sync_regs)"); - - TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)), - "vrs0-15 == 0 (sync_regs)"); -} - -static void assert_initial_noclear(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; - - TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, - "gpr0 == 0xffff000000000000 (sync_regs)"); - TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, - "gpr1 == 0x0000555500000000 (sync_regs)"); - TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL, - "gpr2 == 0x00000000aaaa0000 (sync_regs)"); - TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL, - "gpr3 == 0x0000000000000000 (sync_regs)"); - TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL, - "fpr0 == 0f1 (sync_regs)"); - TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); -} - -static void assert_initial(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; - struct kvm_sregs sregs; - struct kvm_fpu fpu; - - /* KVM_GET_SREGS */ - vcpu_sregs_get(vcpu, &sregs); - TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); - TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, - "cr14 == 0xC2000000 (KVM_GET_SREGS)"); - TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12), - "cr1-13 == 0 (KVM_GET_SREGS)"); - TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)"); - - /* sync regs */ - TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)"); - TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL, - "cr14 == 0xC2000000 (sync_regs)"); - TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12), - "cr1-13 == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)"); - TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); - - /* kvm_run */ - TEST_ASSERT(vcpu->run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); - TEST_ASSERT(vcpu->run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); - - vcpu_fpu_get(vcpu, &fpu); - TEST_ASSERT(!fpu.fpc, "fpc == 0"); - - test_one_reg(vcpu, KVM_REG_S390_GBEA, 1); - test_one_reg(vcpu, KVM_REG_S390_PP, 0); - test_one_reg(vcpu, KVM_REG_S390_TODPR, 0); - test_one_reg(vcpu, KVM_REG_S390_CPU_TIMER, 0); - test_one_reg(vcpu, KVM_REG_S390_CLOCK_COMP, 0); -} - -static void assert_normal_noclear(struct kvm_vcpu *vcpu) -{ - struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; - - TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); - 
TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); - TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); - TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); -} - -static void assert_normal(struct kvm_vcpu *vcpu) -{ - test_one_reg(vcpu, KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); - TEST_ASSERT(vcpu->run->s.regs.pft == KVM_S390_PFAULT_TOKEN_INVALID, - "pft == 0xff..... (sync_regs)"); - assert_noirq(vcpu); -} - -static void inject_irq(struct kvm_vcpu *vcpu) -{ - struct kvm_s390_irq_state irq_state; - struct kvm_s390_irq *irq = &buf[0]; - int irqs; - - /* Inject IRQ */ - irq_state.len = sizeof(struct kvm_s390_irq); - irq_state.buf = (unsigned long)buf; - irq->type = KVM_S390_INT_EMERGENCY; - irq->u.emerg.code = vcpu->id; - irqs = __vcpu_ioctl(vcpu, KVM_S390_SET_IRQ_STATE, &irq_state); - TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d", errno); -} - -static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) -{ - struct kvm_vm *vm; - - vm = vm_create(1); - - *vcpu = vm_vcpu_add(vm, ARBITRARY_NON_ZERO_VCPU_ID, guest_code_initial); - - return vm; -} - -static void test_normal(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - ksft_print_msg("Testing normal reset\n"); - vm = create_vm(&vcpu); - - vcpu_run(vcpu); - - inject_irq(vcpu); - - vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, NULL); - - /* must clears */ - assert_normal(vcpu); - /* must not clears */ - assert_normal_noclear(vcpu); - assert_initial_noclear(vcpu); - - kvm_vm_free(vm); -} - -static void test_initial(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - ksft_print_msg("Testing initial reset\n"); - vm = create_vm(&vcpu); - - vcpu_run(vcpu); - - inject_irq(vcpu); - - vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, NULL); - - /* must clears */ - assert_normal(vcpu); - assert_initial(vcpu); - /* must not clears */ - assert_initial_noclear(vcpu); - - kvm_vm_free(vm); -} - -static void test_clear(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - ksft_print_msg("Testing clear reset\n"); - vm = create_vm(&vcpu); - - vcpu_run(vcpu); - - inject_irq(vcpu); - - vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, NULL); - - /* must clears */ - assert_normal(vcpu); - assert_initial(vcpu); - assert_clear(vcpu); - - kvm_vm_free(vm); -} - -struct testdef { - const char *name; - void (*test)(void); - bool needs_cap; -} testlist[] = { - { "initial", test_initial, false }, - { "normal", test_normal, true }, - { "clear", test_clear, true }, -}; - -int main(int argc, char *argv[]) -{ - bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); - int idx; - - ksft_print_header(); - ksft_set_plan(ARRAY_SIZE(testlist)); - - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - if (!testlist[idx].needs_cap || has_s390_vcpu_resets) { - testlist[idx].test(); - ksft_test_result_pass("%s\n", testlist[idx].name); - } else { - ksft_test_result_skip("%s - no VCPU_RESETS capability\n", - testlist[idx].name); - } - } - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c deleted file mode 100644 index bba0d9a6dcc8..000000000000 --- a/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Test shared zeropage handling (with/without storage keys) - * - * Copyright (C) 2024, Red Hat, Inc. 
- */ -#include - -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "kselftest.h" -#include "ucall_common.h" - -static void set_storage_key(void *addr, uint8_t skey) -{ - asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); -} - -static void guest_code(void) -{ - /* Issue some storage key instruction. */ - set_storage_key((void *)0, 0x98); - GUEST_DONE(); -} - -/* - * Returns 1 if the shared zeropage is mapped, 0 if something else is mapped. - * Returns < 0 on error or if nothing is mapped. - */ -static int maps_shared_zeropage(int pagemap_fd, void *addr) -{ - struct page_region region; - struct pm_scan_arg arg = { - .start = (uintptr_t)addr, - .end = (uintptr_t)addr + 4096, - .vec = (uintptr_t)®ion, - .vec_len = 1, - .size = sizeof(struct pm_scan_arg), - .category_mask = PAGE_IS_PFNZERO, - .category_anyof_mask = PAGE_IS_PRESENT, - .return_mask = PAGE_IS_PFNZERO, - }; - return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); -} - -int main(int argc, char *argv[]) -{ - char *mem, *page0, *page1, *page2, tmp; - const size_t pagesize = getpagesize(); - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int pagemap_fd; - - ksft_print_header(); - ksft_set_plan(3); - - /* - * We'll use memory that is not mapped into the VM for simplicity. - * Shared zeropages are enabled/disabled per-process. - */ - mem = mmap(0, 3 * pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); - - /* Disable THP. Ignore errors on older kernels. */ - madvise(mem, 3 * pagesize, MADV_NOHUGEPAGE); - - page0 = mem; - page1 = page0 + pagesize; - page2 = page1 + pagesize; - - /* Can we even detect shared zeropages? */ - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - TEST_REQUIRE(pagemap_fd >= 0); - - tmp = *page0; - asm volatile("" : "+r" (tmp)); - TEST_REQUIRE(maps_shared_zeropage(pagemap_fd, page0) == 1); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - /* Verify that we get the shared zeropage after VM creation. */ - tmp = *page1; - asm volatile("" : "+r" (tmp)); - ksft_test_result(maps_shared_zeropage(pagemap_fd, page1) == 1, - "Shared zeropages should be enabled\n"); - - /* - * Let our VM execute a storage key instruction that should - * unshare all shared zeropages. - */ - vcpu_run(vcpu); - get_ucall(vcpu, &uc); - TEST_ASSERT_EQ(uc.cmd, UCALL_DONE); - - /* Verify that we don't have a shared zeropage anymore. */ - ksft_test_result(!maps_shared_zeropage(pagemap_fd, page1), - "Shared zeropage should be gone\n"); - - /* Verify that we don't get any new shared zeropages. */ - tmp = *page2; - asm volatile("" : "+r" (tmp)); - ksft_test_result(!maps_shared_zeropage(pagemap_fd, page2), - "Shared zeropages should be disabled\n"); - - kvm_vm_free(vm); - - ksft_finished(); -} diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c deleted file mode 100644 index 53def355ccba..000000000000 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ /dev/null @@ -1,238 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test for s390x KVM_CAP_SYNC_REGS - * - * Based on the same test for x86: - * Copyright (C) 2018, Google LLC. - * - * Adaptions for s390x: - * Copyright (C) 2019, Red Hat, Inc. - * - * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. 
- */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "diag318_test_handler.h" -#include "kselftest.h" - -static void guest_code(void) -{ - /* - * We embed diag 501 here instead of doing a ucall to avoid that - * the compiler has messed with r11 at the time of the ucall. - */ - asm volatile ( - "0: diag 0,0,0x501\n" - " ahi 11,1\n" - " j 0b\n" - ); -} - -#define REG_COMPARE(reg) \ - TEST_ASSERT(left->reg == right->reg, \ - "Register " #reg \ - " values did not match: 0x%llx, 0x%llx", \ - left->reg, right->reg) - -#define REG_COMPARE32(reg) \ - TEST_ASSERT(left->reg == right->reg, \ - "Register " #reg \ - " values did not match: 0x%x, 0x%x", \ - left->reg, right->reg) - - -static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right) -{ - int i; - - for (i = 0; i < 16; i++) - REG_COMPARE(gprs[i]); -} - -static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) -{ - int i; - - for (i = 0; i < 16; i++) - REG_COMPARE32(acrs[i]); - - for (i = 0; i < 16; i++) - REG_COMPARE(crs[i]); -} - -#undef REG_COMPARE - -#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) -#define INVALID_SYNC_FIELD 0x80000000 - -void test_read_invalid(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - int rv; - - /* Request reading invalid register set from VCPU. */ - run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_valid_regs = 0; - - run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_valid_regs = 0; -} - -void test_set_invalid(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - int rv; - - /* Request setting invalid register set into VCPU. */ - run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_dirty_regs = 0; - - run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_dirty_regs = 0; -} - -void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct kvm_sregs sregs; - struct kvm_regs regs; - int rv; - - /* Request and verify all valid register sets. 
*/ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT(run->s390_sieic.icptcode == 4 && - (run->s390_sieic.ipa >> 8) == 0x83 && - (run->s390_sieic.ipb >> 16) == 0x501, - "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x", - run->s390_sieic.icptcode, run->s390_sieic.ipa, - run->s390_sieic.ipb); - - vcpu_regs_get(vcpu, ®s); - compare_regs(®s, &run->s.regs); - - vcpu_sregs_get(vcpu, &sregs); - compare_sregs(&sregs, &run->s.regs); -} - -void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct kvm_sregs sregs; - struct kvm_regs regs; - int rv; - - /* Set and verify various register values */ - run->s.regs.gprs[11] = 0xBAD1DEA; - run->s.regs.acrs[0] = 1 << 11; - - run->kvm_valid_regs = TEST_SYNC_FIELDS; - run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS; - - if (get_diag318_info() > 0) { - run->s.regs.diag318 = get_diag318_info(); - run->kvm_dirty_regs |= KVM_SYNC_DIAG318; - } - - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, - "r11 sync regs value incorrect 0x%llx.", - run->s.regs.gprs[11]); - TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, - "acr0 sync regs value incorrect 0x%x.", - run->s.regs.acrs[0]); - TEST_ASSERT(run->s.regs.diag318 == get_diag318_info(), - "diag318 sync regs value incorrect 0x%llx.", - run->s.regs.diag318); - - vcpu_regs_get(vcpu, ®s); - compare_regs(®s, &run->s.regs); - - vcpu_sregs_get(vcpu, &sregs); - compare_sregs(&sregs, &run->s.regs); -} - -void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - int rv; - - /* Clear kvm_dirty_regs bits, verify new s.regs values are - * overwritten with existing guest values. 
- */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - run->kvm_dirty_regs = 0; - run->s.regs.gprs[11] = 0xDEADBEEF; - run->s.regs.diag318 = 0x4B1D; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, - "r11 sync regs value incorrect 0x%llx.", - run->s.regs.gprs[11]); - TEST_ASSERT(run->s.regs.diag318 != 0x4B1D, - "diag318 sync regs value incorrect 0x%llx.", - run->s.regs.diag318); -} - -struct testdef { - const char *name; - void (*test)(struct kvm_vcpu *vcpu); -} testlist[] = { - { "read invalid", test_read_invalid }, - { "set invalid", test_set_invalid }, - { "request+verify all valid regs", test_req_and_verify_all_valid_regs }, - { "set+verify various regs", test_set_and_verify_various_reg_values }, - { "clear kvm_dirty_regs bits", test_clear_kvm_dirty_regs_bits }, -}; - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - int idx; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); - - ksft_print_header(); - - ksft_set_plan(ARRAY_SIZE(testlist)); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - testlist[idx].test(vcpu); - ksft_test_result_pass("%s\n", testlist[idx].name); - } - - kvm_vm_free(vm); - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c deleted file mode 100644 index 12d5e1cb62e3..000000000000 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ /dev/null @@ -1,244 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Test TEST PROTECTION emulation. - * - * Copyright IBM Corp. 2021 - */ -#include -#include "test_util.h" -#include "kvm_util.h" -#include "kselftest.h" -#include "ucall_common.h" -#include "processor.h" - -#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) -#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) - -static __aligned(PAGE_SIZE) uint8_t pages[2][PAGE_SIZE]; -static uint8_t *const page_store_prot = pages[0]; -static uint8_t *const page_fetch_prot = pages[1]; - -/* Nonzero return value indicates that address not mapped */ -static int set_storage_key(void *addr, uint8_t key) -{ - int not_mapped = 0; - - asm volatile ( - "lra %[addr], 0(0,%[addr])\n" - " jz 0f\n" - " llill %[not_mapped],1\n" - " j 1f\n" - "0: sske %[key], %[addr]\n" - "1:" - : [addr] "+&a" (addr), [not_mapped] "+r" (not_mapped) - : [key] "r" (key) - : "cc" - ); - return -not_mapped; -} - -enum permission { - READ_WRITE = 0, - READ = 1, - RW_PROTECTED = 2, - TRANSL_UNAVAIL = 3, -}; - -static enum permission test_protection(void *addr, uint8_t key) -{ - uint64_t mask; - - asm volatile ( - "tprot %[addr], 0(%[key])\n" - " ipm %[mask]\n" - : [mask] "=r" (mask) - : [addr] "Q" (*(char *)addr), - [key] "a" (key) - : "cc" - ); - - return (enum permission)(mask >> 28); -} - -enum stage { - STAGE_INIT_SIMPLE, - TEST_SIMPLE, - STAGE_INIT_FETCH_PROT_OVERRIDE, - TEST_FETCH_PROT_OVERRIDE, - TEST_STORAGE_PROT_OVERRIDE, - STAGE_END /* must be the last entry (it's the amount of tests) */ -}; - -struct test { - enum stage stage; - void *addr; - uint8_t key; - enum permission expected; -} tests[] = { - /* - * We perform each test in the array by executing TEST PROTECTION on - * the specified addr with the specified key and checking if the returned - * permissions match the expected value. 
- * Both guest and host cooperate to set up the required test conditions. - * A central condition is that the page targeted by addr has to be DAT - * protected in the host mappings, in order for KVM to emulate the - * TEST PROTECTION instruction. - * Since the page tables are shared, the host uses mprotect to achieve - * this. - * - * Test resulting in RW_PROTECTED/TRANSL_UNAVAIL will be interpreted - * by SIE, not KVM, but there is no harm in testing them also. - * See Enhanced Suppression-on-Protection Facilities in the - * Interpretive-Execution Mode - */ - /* - * guest: set storage key of page_store_prot to 1 - * storage key of page_fetch_prot to 9 and enable - * protection for it - * STAGE_INIT_SIMPLE - * host: write protect both via mprotect - */ - /* access key 0 matches any storage key -> RW */ - { TEST_SIMPLE, page_store_prot, 0x00, READ_WRITE }, - /* access key matches storage key -> RW */ - { TEST_SIMPLE, page_store_prot, 0x10, READ_WRITE }, - /* mismatched keys, but no fetch protection -> RO */ - { TEST_SIMPLE, page_store_prot, 0x20, READ }, - /* access key 0 matches any storage key -> RW */ - { TEST_SIMPLE, page_fetch_prot, 0x00, READ_WRITE }, - /* access key matches storage key -> RW */ - { TEST_SIMPLE, page_fetch_prot, 0x90, READ_WRITE }, - /* mismatched keys, fetch protection -> inaccessible */ - { TEST_SIMPLE, page_fetch_prot, 0x10, RW_PROTECTED }, - /* page 0 not mapped yet -> translation not available */ - { TEST_SIMPLE, (void *)0x00, 0x10, TRANSL_UNAVAIL }, - /* - * host: try to map page 0 - * guest: set storage key of page 0 to 9 and enable fetch protection - * STAGE_INIT_FETCH_PROT_OVERRIDE - * host: write protect page 0 - * enable fetch protection override - */ - /* mismatched keys, fetch protection, but override applies -> RO */ - { TEST_FETCH_PROT_OVERRIDE, (void *)0x00, 0x10, READ }, - /* mismatched keys, fetch protection, override applies to 0-2048 only -> inaccessible */ - { TEST_FETCH_PROT_OVERRIDE, (void *)2049, 0x10, RW_PROTECTED }, - /* - * host: enable storage protection override - */ - /* mismatched keys, but override applies (storage key 9) -> RW */ - { TEST_STORAGE_PROT_OVERRIDE, page_fetch_prot, 0x10, READ_WRITE }, - /* mismatched keys, no fetch protection, override doesn't apply -> RO */ - { TEST_STORAGE_PROT_OVERRIDE, page_store_prot, 0x20, READ }, - /* mismatched keys, but override applies (storage key 9) -> RW */ - { TEST_STORAGE_PROT_OVERRIDE, (void *)2049, 0x10, READ_WRITE }, - /* end marker */ - { STAGE_END, 0, 0, 0 }, -}; - -static enum stage perform_next_stage(int *i, bool mapped_0) -{ - enum stage stage = tests[*i].stage; - enum permission result; - bool skip; - - for (; tests[*i].stage == stage; (*i)++) { - /* - * Some fetch protection override tests require that page 0 - * be mapped, however, when the hosts tries to map that page via - * vm_vaddr_alloc, it may happen that some other page gets mapped - * instead. 
- * In order to skip these tests we detect this inside the guest - */ - skip = tests[*i].addr < (void *)PAGE_SIZE && - tests[*i].expected != TRANSL_UNAVAIL && - !mapped_0; - if (!skip) { - result = test_protection(tests[*i].addr, tests[*i].key); - __GUEST_ASSERT(result == tests[*i].expected, - "Wanted %u, got %u, for i = %u", - tests[*i].expected, result, *i); - } - } - return stage; -} - -static void guest_code(void) -{ - bool mapped_0; - int i = 0; - - GUEST_ASSERT_EQ(set_storage_key(page_store_prot, 0x10), 0); - GUEST_ASSERT_EQ(set_storage_key(page_fetch_prot, 0x98), 0); - GUEST_SYNC(STAGE_INIT_SIMPLE); - GUEST_SYNC(perform_next_stage(&i, false)); - - /* Fetch-protection override */ - mapped_0 = !set_storage_key((void *)0, 0x98); - GUEST_SYNC(STAGE_INIT_FETCH_PROT_OVERRIDE); - GUEST_SYNC(perform_next_stage(&i, mapped_0)); - - /* Storage-protection override */ - GUEST_SYNC(perform_next_stage(&i, mapped_0)); -} - -#define HOST_SYNC_NO_TAP(vcpup, stage) \ -({ \ - struct kvm_vcpu *__vcpu = (vcpup); \ - struct ucall uc; \ - int __stage = (stage); \ - \ - vcpu_run(__vcpu); \ - get_ucall(__vcpu, &uc); \ - if (uc.cmd == UCALL_ABORT) \ - REPORT_GUEST_ASSERT(uc); \ - TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ - TEST_ASSERT_EQ(uc.args[1], __stage); \ -}) - -#define HOST_SYNC(vcpu, stage) \ -({ \ - HOST_SYNC_NO_TAP(vcpu, stage); \ - ksft_test_result_pass("" #stage "\n"); \ -}) - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_run *run; - vm_vaddr_t guest_0_page; - - ksft_print_header(); - ksft_set_plan(STAGE_END); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - - HOST_SYNC(vcpu, STAGE_INIT_SIMPLE); - mprotect(addr_gva2hva(vm, (vm_vaddr_t)pages), PAGE_SIZE * 2, PROT_READ); - HOST_SYNC(vcpu, TEST_SIMPLE); - - guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); - if (guest_0_page != 0) { - /* Use NO_TAP so we don't get a PASS print */ - HOST_SYNC_NO_TAP(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); - ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " - "Did not allocate page at 0\n"); - } else { - HOST_SYNC(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); - } - if (guest_0_page == 0) - mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); - run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; - run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(vcpu, TEST_FETCH_PROT_OVERRIDE); - - run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; - run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(vcpu, TEST_STORAGE_PROT_OVERRIDE); - - kvm_vm_free(vm); - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c deleted file mode 100644 index 0c112319dab1..000000000000 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ /dev/null @@ -1,638 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test code for the s390x kvm ucontrol interface - * - * Copyright IBM Corp. 
2024 - * - * Authors: - * Christoph Schlameuss - */ -#include "debug_print.h" -#include "kselftest_harness.h" -#include "kvm_util.h" -#include "processor.h" -#include "sie.h" - -#include -#include - -#define PGM_SEGMENT_TRANSLATION 0x10 - -#define VM_MEM_SIZE (4 * SZ_1M) -#define VM_MEM_EXT_SIZE (2 * SZ_1M) -#define VM_MEM_MAX_M ((VM_MEM_SIZE + VM_MEM_EXT_SIZE) / SZ_1M) - -/* so directly declare capget to check caps without libcap */ -int capget(cap_user_header_t header, cap_user_data_t data); - -/** - * In order to create user controlled virtual machines on S390, - * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL - * as privileged user (SYS_ADMIN). - */ -void require_ucontrol_admin(void) -{ - struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; - struct __user_cap_header_struct hdr = { - .version = _LINUX_CAPABILITY_VERSION_3, - }; - int rc; - - rc = capget(&hdr, data); - TEST_ASSERT_EQ(0, rc); - TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); -} - -/* Test program setting some registers and looping */ -extern char test_gprs_asm[]; -asm("test_gprs_asm:\n" - "xgr %r0, %r0\n" - "lgfi %r1,1\n" - "lgfi %r2,2\n" - "lgfi %r3,3\n" - "lgfi %r4,4\n" - "lgfi %r5,5\n" - "lgfi %r6,6\n" - "lgfi %r7,7\n" - "0:\n" - " diag 0,0,0x44\n" - " ahi %r0,1\n" - " j 0b\n" -); - -/* Test program manipulating memory */ -extern char test_mem_asm[]; -asm("test_mem_asm:\n" - "xgr %r0, %r0\n" - - "0:\n" - " ahi %r0,1\n" - " st %r1,0(%r5,%r6)\n" - - " xgr %r1,%r1\n" - " l %r1,0(%r5,%r6)\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - - " j 0b\n" -); - -/* Test program manipulating storage keys */ -extern char test_skey_asm[]; -asm("test_skey_asm:\n" - "xgr %r0, %r0\n" - - "0:\n" - " ahi %r0,1\n" - " st %r1,0(%r5,%r6)\n" - - " iske %r1,%r6\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - - " sske %r1,%r6\n" - " xgr %r1,%r1\n" - " iske %r1,%r6\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - - " rrbe %r1,%r6\n" - " iske %r1,%r6\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - - " j 0b\n" -); - -FIXTURE(uc_kvm) -{ - struct kvm_s390_sie_block *sie_block; - struct kvm_run *run; - uintptr_t base_gpa; - uintptr_t code_gpa; - uintptr_t base_hva; - uintptr_t code_hva; - int kvm_run_size; - vm_paddr_t pgd; - void *vm_mem; - int vcpu_fd; - int kvm_fd; - int vm_fd; -}; - -/** - * create VM with single vcpu, map kvm_run and SIE control block for easy access - */ -FIXTURE_SETUP(uc_kvm) -{ - struct kvm_s390_vm_cpu_processor info; - int rc; - - require_ucontrol_admin(); - - self->kvm_fd = open_kvm_dev_path_or_exit(); - self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); - ASSERT_GE(self->vm_fd, 0); - - kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, - KVM_S390_VM_CPU_PROCESSOR, &info); - TH_LOG("create VM 0x%llx", info.cpuid); - - self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); - ASSERT_GE(self->vcpu_fd, 0); - - self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); - ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) - TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); - self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, - PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); - ASSERT_NE(self->run, MAP_FAILED); - /** - * For virtual cpus that have been created with S390 user controlled - * virtual machines, the resulting vcpu fd can be memory mapped at page - * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of - * the virtual cpu's hardware control block. 
- */ - self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, - PROT_READ | PROT_WRITE, MAP_SHARED, - self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); - ASSERT_NE(self->sie_block, MAP_FAILED); - - TH_LOG("VM created %p %p", self->run, self->sie_block); - - self->base_gpa = 0; - self->code_gpa = self->base_gpa + (3 * SZ_1M); - - self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_MAX_M * SZ_1M); - ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); - self->base_hva = (uintptr_t)self->vm_mem; - self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; - struct kvm_s390_ucas_mapping map = { - .user_addr = self->base_hva, - .vcpu_addr = self->base_gpa, - .length = VM_MEM_SIZE, - }; - TH_LOG("ucas map %p %p 0x%llx", - (void *)map.user_addr, (void *)map.vcpu_addr, map.length); - rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); - ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", - rc, strerror(errno)); - - TH_LOG("page in %p", (void *)self->base_gpa); - rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); - ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", - (void *)self->base_hva, rc, strerror(errno)); - - self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; -} - -FIXTURE_TEARDOWN(uc_kvm) -{ - munmap(self->sie_block, PAGE_SIZE); - munmap(self->run, self->kvm_run_size); - close(self->vcpu_fd); - close(self->vm_fd); - close(self->kvm_fd); - free(self->vm_mem); -} - -TEST_F(uc_kvm, uc_sie_assertions) -{ - /* assert interception of Code 08 (Program Interruption) is set */ - EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); -} - -TEST_F(uc_kvm, uc_attr_mem_limit) -{ - u64 limit; - struct kvm_device_attr attr = { - .group = KVM_S390_VM_MEM_CTRL, - .attr = KVM_S390_VM_MEM_LIMIT_SIZE, - .addr = (unsigned long)&limit, - }; - int rc; - - rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); - EXPECT_EQ(0, rc); - EXPECT_EQ(~0UL, limit); - - /* assert set not supported */ - rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); - EXPECT_EQ(-1, rc); - EXPECT_EQ(EINVAL, errno); -} - -TEST_F(uc_kvm, uc_no_dirty_log) -{ - struct kvm_dirty_log dlog; - int rc; - - rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); - EXPECT_EQ(-1, rc); - EXPECT_EQ(EINVAL, errno); -} - -/** - * Assert HPAGE CAP cannot be enabled on UCONTROL VM - */ -TEST(uc_cap_hpage) -{ - int rc, kvm_fd, vm_fd, vcpu_fd; - struct kvm_enable_cap cap = { - .cap = KVM_CAP_S390_HPAGE_1M, - }; - - require_ucontrol_admin(); - - kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); - ASSERT_GE(vm_fd, 0); - - /* assert hpages are not supported on ucontrol vm */ - rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); - EXPECT_EQ(0, rc); - - /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ - rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); - EXPECT_EQ(-1, rc); - EXPECT_EQ(EINVAL, errno); - - /* assert HPAGE CAP is rejected after vCPU creation */ - vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); - ASSERT_GE(vcpu_fd, 0); - rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); - EXPECT_EQ(-1, rc); - EXPECT_EQ(EBUSY, errno); - - close(vcpu_fd); - close(vm_fd); - close(kvm_fd); -} - -/* calculate host virtual addr from guest physical addr */ -static void *gpa2hva(FIXTURE_DATA(uc_kvm) *self, u64 gpa) -{ - return (void *)(self->base_hva - self->base_gpa + gpa); -} - -/* map / make additional memory available */ -static int uc_map_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length) -{ - struct kvm_s390_ucas_mapping map = { - .user_addr = 
(u64)gpa2hva(self, vcpu_addr), - .vcpu_addr = vcpu_addr, - .length = length, - }; - pr_info("ucas map %p %p 0x%llx", - (void *)map.user_addr, (void *)map.vcpu_addr, map.length); - return ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); -} - -/* unmap previously mapped memory */ -static int uc_unmap_ext(FIXTURE_DATA(uc_kvm) *self, u64 vcpu_addr, u64 length) -{ - struct kvm_s390_ucas_mapping map = { - .user_addr = (u64)gpa2hva(self, vcpu_addr), - .vcpu_addr = vcpu_addr, - .length = length, - }; - pr_info("ucas unmap %p %p 0x%llx", - (void *)map.user_addr, (void *)map.vcpu_addr, map.length); - return ioctl(self->vcpu_fd, KVM_S390_UCAS_UNMAP, &map); -} - -/* handle ucontrol exit by mapping the accessed segment */ -static void uc_handle_exit_ucontrol(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_run *run = self->run; - u64 seg_addr; - int rc; - - TEST_ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); - switch (run->s390_ucontrol.pgm_code) { - case PGM_SEGMENT_TRANSLATION: - seg_addr = run->s390_ucontrol.trans_exc_code & ~(SZ_1M - 1); - pr_info("ucontrol pic segment translation 0x%llx, mapping segment 0x%lx\n", - run->s390_ucontrol.trans_exc_code, seg_addr); - /* map / make additional memory available */ - rc = uc_map_ext(self, seg_addr, SZ_1M); - TEST_ASSERT_EQ(0, rc); - break; - default: - TEST_FAIL("UNEXPECTED PGM CODE %d", run->s390_ucontrol.pgm_code); - } -} - -/* - * Handle the SIEIC exit - * * fail on codes not expected in the test cases - * Returns if interception is handled / execution can be continued - */ -static void uc_skey_enable(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_s390_sie_block *sie_block = self->sie_block; - - /* disable KSS */ - sie_block->cpuflags &= ~CPUSTAT_KSS; - /* disable skey inst interception */ - sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); -} - -/* - * Handle the instruction intercept - * Returns if interception is handled / execution can be continued - */ -static bool uc_handle_insn_ic(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_s390_sie_block *sie_block = self->sie_block; - int ilen = insn_length(sie_block->ipa >> 8); - struct kvm_run *run = self->run; - - switch (run->s390_sieic.ipa) { - case 0xB229: /* ISKE */ - case 0xB22b: /* SSKE */ - case 0xB22a: /* RRBE */ - uc_skey_enable(self); - - /* rewind to reexecute intercepted instruction */ - run->psw_addr = run->psw_addr - ilen; - pr_info("rewind guest addr to 0x%.16llx\n", run->psw_addr); - return true; - default: - return false; - } -} - -/* - * Handle the SIEIC exit - * * fail on codes not expected in the test cases - * Returns if interception is handled / execution can be continued - */ -static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_s390_sie_block *sie_block = self->sie_block; - struct kvm_run *run = self->run; - - /* check SIE interception code */ - pr_info("sieic: 0x%.2x 0x%.4x 0x%.8x\n", - run->s390_sieic.icptcode, - run->s390_sieic.ipa, - run->s390_sieic.ipb); - switch (run->s390_sieic.icptcode) { - case ICPT_INST: - /* end execution in caller on intercepted instruction */ - pr_info("sie instruction interception\n"); - return uc_handle_insn_ic(self); - case ICPT_KSS: - uc_skey_enable(self); - return true; - case ICPT_OPEREXC: - /* operation exception */ - TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb); - default: - TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode); - } - return true; -} - -/* verify VM state on exit */ -static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_run *run = self->run; - - switch 
(run->exit_reason) { - case KVM_EXIT_S390_UCONTROL: - /** check program interruption code - * handle page fault --> ucas map - */ - uc_handle_exit_ucontrol(self); - break; - case KVM_EXIT_S390_SIEIC: - return uc_handle_sieic(self); - default: - pr_info("exit_reason %2d not handled\n", run->exit_reason); - } - return true; -} - -/* run the VM until interrupted */ -static int uc_run_once(FIXTURE_DATA(uc_kvm) *self) -{ - int rc; - - rc = ioctl(self->vcpu_fd, KVM_RUN, NULL); - print_run(self->run, self->sie_block); - print_regs(self->run); - pr_debug("run %d / %d %s\n", rc, errno, strerror(errno)); - return rc; -} - -static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) *self) -{ - struct kvm_s390_sie_block *sie_block = self->sie_block; - - /* assert vm was interrupted by diag 0x0044 */ - TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); - TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); - TEST_ASSERT_EQ(0x8300, sie_block->ipa); - TEST_ASSERT_EQ(0x440000, sie_block->ipb); -} - -TEST_F(uc_kvm, uc_no_user_region) -{ - struct kvm_userspace_memory_region region = { - .slot = 1, - .guest_phys_addr = self->code_gpa, - .memory_size = VM_MEM_EXT_SIZE, - .userspace_addr = (uintptr_t)self->code_hva, - }; - struct kvm_userspace_memory_region2 region2 = { - .slot = 1, - .guest_phys_addr = self->code_gpa, - .memory_size = VM_MEM_EXT_SIZE, - .userspace_addr = (uintptr_t)self->code_hva, - }; - - ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, ®ion)); - ASSERT_EQ(EINVAL, errno); - - ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, ®ion2)); - ASSERT_EQ(EINVAL, errno); -} - -TEST_F(uc_kvm, uc_map_unmap) -{ - struct kvm_sync_regs *sync_regs = &self->run->s.regs; - struct kvm_run *run = self->run; - const u64 disp = 1; - int rc; - - /* copy test_mem_asm to code_hva / code_gpa */ - TH_LOG("copy code %p to vm mapped memory %p / %p", - &test_mem_asm, (void *)self->code_hva, (void *)self->code_gpa); - memcpy((void *)self->code_hva, &test_mem_asm, PAGE_SIZE); - - /* DAT disabled + 64 bit mode */ - run->psw_mask = 0x0000000180000000ULL; - run->psw_addr = self->code_gpa; - - /* set register content for test_mem_asm to access not mapped memory*/ - sync_regs->gprs[1] = 0x55; - sync_regs->gprs[5] = self->base_gpa; - sync_regs->gprs[6] = VM_MEM_SIZE + disp; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - - /* run and expect to fail with ucontrol pic segment translation */ - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(1, sync_regs->gprs[0]); - ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); - - ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code); - ASSERT_EQ(self->base_gpa + VM_MEM_SIZE, run->s390_ucontrol.trans_exc_code); - - /* fail to map memory with not segment aligned address */ - rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE + disp, VM_MEM_EXT_SIZE); - ASSERT_GT(0, rc) - TH_LOG("ucas map for non segment address should fail but didn't; " - "result %d not expected, %s", rc, strerror(errno)); - - /* map / make additional memory available */ - rc = uc_map_ext(self, self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE); - ASSERT_EQ(0, rc) - TH_LOG("ucas map result %d not expected, %s", rc, strerror(errno)); - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - uc_assert_diag44(self); - - /* assert registers and memory are in expected state */ - ASSERT_EQ(2, sync_regs->gprs[0]); - ASSERT_EQ(0x55, sync_regs->gprs[1]); - ASSERT_EQ(0x55, *(u32 *)gpa2hva(self, self->base_gpa + VM_MEM_SIZE + disp)); - - /* unmap and run loop again */ - rc = uc_unmap_ext(self, 
self->base_gpa + VM_MEM_SIZE, VM_MEM_EXT_SIZE); - ASSERT_EQ(0, rc) - TH_LOG("ucas unmap result %d not expected, %s", rc, strerror(errno)); - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(3, sync_regs->gprs[0]); - ASSERT_EQ(KVM_EXIT_S390_UCONTROL, run->exit_reason); - ASSERT_EQ(PGM_SEGMENT_TRANSLATION, run->s390_ucontrol.pgm_code); - /* handle ucontrol exit and remap memory after previous map and unmap */ - ASSERT_EQ(true, uc_handle_exit(self)); -} - -TEST_F(uc_kvm, uc_gprs) -{ - struct kvm_sync_regs *sync_regs = &self->run->s.regs; - struct kvm_run *run = self->run; - struct kvm_regs regs = {}; - - /* Set registers to values that are different from the ones that we expect below */ - for (int i = 0; i < 8; i++) - sync_regs->gprs[i] = 8; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - - /* copy test_gprs_asm to code_hva / code_gpa */ - TH_LOG("copy code %p to vm mapped memory %p / %p", - &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa); - memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE); - - /* DAT disabled + 64 bit mode */ - run->psw_mask = 0x0000000180000000ULL; - run->psw_addr = self->code_gpa; - - /* run and expect interception of diag 44 */ - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - uc_assert_diag44(self); - - /* Retrieve and check guest register values */ - ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); - for (int i = 0; i < 8; i++) { - ASSERT_EQ(i, regs.gprs[i]); - ASSERT_EQ(i, sync_regs->gprs[i]); - } - - /* run and expect interception of diag 44 again */ - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - uc_assert_diag44(self); - - /* check continued increment of register 0 value */ - ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, ®s)); - ASSERT_EQ(1, regs.gprs[0]); - ASSERT_EQ(1, sync_regs->gprs[0]); -} - -TEST_F(uc_kvm, uc_skey) -{ - struct kvm_s390_sie_block *sie_block = self->sie_block; - struct kvm_sync_regs *sync_regs = &self->run->s.regs; - u64 test_vaddr = VM_MEM_SIZE - (SZ_1M / 2); - struct kvm_run *run = self->run; - const u8 skeyvalue = 0x34; - - /* copy test_skey_asm to code_hva / code_gpa */ - TH_LOG("copy code %p to vm mapped memory %p / %p", - &test_skey_asm, (void *)self->code_hva, (void *)self->code_gpa); - memcpy((void *)self->code_hva, &test_skey_asm, PAGE_SIZE); - - /* set register content for test_skey_asm to access not mapped memory */ - sync_regs->gprs[1] = skeyvalue; - sync_regs->gprs[5] = self->base_gpa; - sync_regs->gprs[6] = test_vaddr; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - - /* DAT disabled + 64 bit mode */ - run->psw_mask = 0x0000000180000000ULL; - run->psw_addr = self->code_gpa; - - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(true, uc_handle_exit(self)); - ASSERT_EQ(1, sync_regs->gprs[0]); - - /* ISKE */ - ASSERT_EQ(0, uc_run_once(self)); - - /* - * Bail out and skip the test after uc_skey_enable was executed but iske - * is still intercepted. Instructions are not handled by the kernel. - * Thus there is no need to test this here. - */ - TEST_ASSERT_EQ(0, sie_block->cpuflags & CPUSTAT_KSS); - TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)); - TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); - TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); - TEST_REQUIRE(sie_block->ipa != 0xb229); - - /* ISKE contd. 
*/ - ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(2, sync_regs->gprs[0]); - /* assert initial skey (ACC = 0, R & C = 1) */ - ASSERT_EQ(0x06, sync_regs->gprs[1]); - uc_assert_diag44(self); - - /* SSKE + ISKE */ - sync_regs->gprs[1] = skeyvalue; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(3, sync_regs->gprs[0]); - ASSERT_EQ(skeyvalue, sync_regs->gprs[1]); - uc_assert_diag44(self); - - /* RRBE + ISKE */ - sync_regs->gprs[1] = skeyvalue; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(4, sync_regs->gprs[0]); - /* assert R reset but rest of skey unchanged */ - ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]); - ASSERT_EQ(0, sync_regs->gprs[1] & 0x04); - uc_assert_diag44(self); -} - -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index a8267628e9ed..86ee3385e860 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -17,9 +17,9 @@ #include /* - * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a - * 2MB sized and aligned region so that the initial region corresponds to - * exactly one large page. + * s390 needs at least 1MB alignment, and the x86 MOVE/DELETE tests need a 2MB + * sized and aligned region so that the initial region corresponds to exactly + * one large page. */ #define MEM_REGION_SIZE 0x200000 diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c new file mode 100644 index 000000000000..f4ce5a185a7d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * amx tests + * + * Copyright (C) 2021, Intel, Inc. + * + * Tests for amx #NM exception and save/restore. 
+ */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define NUM_TILES 8 +#define TILE_SIZE 1024 +#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) + +/* Tile configuration associated: */ +#define PALETTE_TABLE_INDEX 1 +#define MAX_TILES 16 +#define RESERVED_BYTES 14 + +#define XSAVE_HDR_OFFSET 512 + +struct tile_config { + u8 palette_id; + u8 start_row; + u8 reserved[RESERVED_BYTES]; + u16 colsb[MAX_TILES]; + u8 rows[MAX_TILES]; +}; + +struct tile_data { + u8 data[NUM_TILES * TILE_SIZE]; +}; + +struct xtile_info { + u16 bytes_per_tile; + u16 bytes_per_row; + u16 max_names; + u16 max_rows; + u32 xsave_offset; + u32 xsave_size; +}; + +static struct xtile_info xtile; + +static inline void __ldtilecfg(void *cfg) +{ + asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00" + : : "a"(cfg)); +} + +static inline void __tileloadd(void *tile) +{ + asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10" + : : "a"(tile), "d"(0)); +} + +static inline void __tilerelease(void) +{ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); +} + +static inline void __xsavec(struct xstate *xstate, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsavec (%%rdi)" + : : "D" (xstate), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static void check_xtile_info(void) +{ + GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE); + + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0)); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE); + + xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET); + GUEST_ASSERT(xtile.xsave_offset == 2816); + xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE); + GUEST_ASSERT(xtile.xsave_size == 8192); + GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size); + + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_MAX_PALETTE_TABLES)); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_AMX_MAX_PALETTE_TABLES) >= + PALETTE_TABLE_INDEX); + + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS)); + xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS); + GUEST_ASSERT(xtile.max_names == 8); + xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE); + GUEST_ASSERT(xtile.bytes_per_tile == 1024); + xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW); + GUEST_ASSERT(xtile.bytes_per_row == 64); + xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS); + GUEST_ASSERT(xtile.max_rows == 16); +} + +static void set_tilecfg(struct tile_config *cfg) +{ + int i; + + /* Only palette id 1 */ + cfg->palette_id = 1; + for (i = 0; i < xtile.max_names; i++) { + cfg->colsb[i] = xtile.bytes_per_row; + cfg->rows[i] = xtile.max_rows; + } +} + +static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, + struct tile_data *tiledata, + struct xstate *xstate) +{ + GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && + this_cpu_has(X86_FEATURE_OSXSAVE)); + check_xtile_info(); + GUEST_SYNC(1); + + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(2); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + GUEST_SYNC(3); + /* Check save/restore when trap to userspace */ + __tileloadd(tiledata); + GUEST_SYNC(4); + __tilerelease(); + GUEST_SYNC(5); + /* + * After XSAVEC, XTILEDATA is cleared in the 
xstate_bv but is set in + * the xcomp_bv. + */ + xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; + __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); + GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); + + /* + * XTILEDATA is cleared in xstate_bv but set in xcomp_bv, this property + * remains the same even when amx tiledata is disabled by IA32_XFD. + */ + xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; + __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); + GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA)); + + GUEST_SYNC(6); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + /* Trigger #NM exception */ + __tileloadd(tiledata); + GUEST_SYNC(10); + + GUEST_DONE(); +} + +void guest_nm_handler(struct ex_regs *regs) +{ + /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ + GUEST_SYNC(7); + GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + GUEST_SYNC(8); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + /* Clear xfd_err */ + wrmsr(MSR_IA32_XFD_ERR, 0); + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(9); +} + +int main(int argc, char *argv[]) +{ + struct kvm_regs regs1, regs2; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_x86_state *state; + int xsave_restore_size; + vm_vaddr_t amx_cfg, tiledata, xstate; + struct ucall uc; + u32 amx_offset; + int ret; + + /* + * Note, all off-by-default features must be enabled before anything + * caches KVM_GET_SUPPORTED_CPUID, e.g. before using kvm_cpu_has(). 
+ */ + vm_xsave_require_permission(XFEATURE_MASK_XTILE_DATA); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA_XFD)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE), + "KVM should enumerate max XSAVE size when XSAVE is supported"); + xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE); + + vcpu_regs_get(vcpu, ®s1); + + /* Register #NM handler */ + vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); + + /* amx cfg for guest_code */ + amx_cfg = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); + + /* amx tiledata for guest_code */ + tiledata = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize()); + + /* XSAVE state for guest_code */ + xstate = vm_vaddr_alloc_pages(vm, DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); + memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); + vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + switch (uc.args[1]) { + case 1: + case 2: + case 3: + case 5: + case 6: + case 7: + case 8: + fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); + break; + case 4: + case 10: + fprintf(stderr, + "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + + /* Compacted mode, get amx offset by xsave area + * size subtract 8K amx size. + */ + amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + state = vcpu_save_state(vcpu); + void *amx_start = (void *)state->xsave + amx_offset; + void *tiles_data = (void *)addr_gva2hva(vm, tiledata); + /* Only check TMM0 register, 1 tile */ + ret = memcmp(amx_start, tiles_data, TILE_SIZE); + TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); + kvm_x86_state_cleanup(state); + break; + case 9: + fprintf(stderr, + "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); + break; + } + break; + case UCALL_DONE: + fprintf(stderr, "UCALL_DONE\n"); + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c new file mode 100644 index 000000000000..f8916bb34405 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024 Intel Corporation + * + * Verify KVM correctly emulates the APIC bus frequency when the VMM configures + * the frequency via KVM_CAP_X86_APIC_BUS_CYCLES_NS. 
Start the APIC timer by + * programming TMICT (timer initial count) to the largest value possible (so + * that the timer will not expire during the test). Then, after an arbitrary + * amount of time has elapsed, verify TMCCT (timer current count) is within 1% + * of the expected value based on the time elapsed, the APIC bus frequency, and + * the programmed TDCR (timer divide configuration register). + */ + +#include "apic.h" +#include "test_util.h" + +/* + * Possible TDCR values with matching divide count. Used to modify APIC + * timer frequency. + */ +static const struct { + const uint32_t tdcr; + const uint32_t divide_count; +} tdcrs[] = { + {0x0, 2}, + {0x1, 4}, + {0x2, 8}, + {0x3, 16}, + {0x8, 32}, + {0x9, 64}, + {0xa, 128}, + {0xb, 1}, +}; + +static bool is_x2apic; + +static void apic_enable(void) +{ + if (is_x2apic) + x2apic_enable(); + else + xapic_enable(); +} + +static uint32_t apic_read_reg(unsigned int reg) +{ + return is_x2apic ? x2apic_read_reg(reg) : xapic_read_reg(reg); +} + +static void apic_write_reg(unsigned int reg, uint32_t val) +{ + if (is_x2apic) + x2apic_write_reg(reg, val); + else + xapic_write_reg(reg, val); +} + +static void apic_guest_code(uint64_t apic_hz, uint64_t delay_ms) +{ + uint64_t tsc_hz = guest_tsc_khz * 1000; + const uint32_t tmict = ~0u; + uint64_t tsc0, tsc1, freq; + uint32_t tmcct; + int i; + + apic_enable(); + + /* + * Setup one-shot timer. The vector does not matter because the + * interrupt should not fire. + */ + apic_write_reg(APIC_LVTT, APIC_LVT_TIMER_ONESHOT | APIC_LVT_MASKED); + + for (i = 0; i < ARRAY_SIZE(tdcrs); i++) { + apic_write_reg(APIC_TDCR, tdcrs[i].tdcr); + apic_write_reg(APIC_TMICT, tmict); + + tsc0 = rdtsc(); + udelay(delay_ms * 1000); + tmcct = apic_read_reg(APIC_TMCCT); + tsc1 = rdtsc(); + + /* + * Stop the timer _after_ reading the current, final count, as + * writing the initial counter also modifies the current count. + */ + apic_write_reg(APIC_TMICT, 0); + + freq = (tmict - tmcct) * tdcrs[i].divide_count * tsc_hz / (tsc1 - tsc0); + /* Check if measured frequency is within 5% of configured frequency. 
*/ + __GUEST_ASSERT(freq < apic_hz * 105 / 100 && freq > apic_hz * 95 / 100, + "Frequency = %lu (wanted %lu - %lu), bus = %lu, div = %u, tsc = %lu", + freq, apic_hz * 95 / 100, apic_hz * 105 / 100, + apic_hz, tdcrs[i].divide_count, tsc_hz); + } + + GUEST_DONE(); +} + +static void test_apic_bus_clock(struct kvm_vcpu *vcpu) +{ + bool done = false; + struct ucall uc; + + while (!done) { + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + done = true; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } + } +} + +static void run_apic_bus_clock_test(uint64_t apic_hz, uint64_t delay_ms, + bool x2apic) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int ret; + + is_x2apic = x2apic; + + vm = vm_create(1); + + sync_global_to_guest(vm, is_x2apic); + + vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS, + NSEC_PER_SEC / apic_hz); + + vcpu = vm_vcpu_add(vm, 0, apic_guest_code); + vcpu_args_set(vcpu, 2, apic_hz, delay_ms); + + ret = __vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS, + NSEC_PER_SEC / apic_hz); + TEST_ASSERT(ret < 0 && errno == EINVAL, + "Setting of APIC bus frequency after vCPU is created should fail."); + + if (!is_x2apic) + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + test_apic_bus_clock(vcpu); + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-d delay] [-f APIC bus freq]\n", name); + puts(""); + printf("-d: Delay (in msec) guest uses to measure APIC bus frequency.\n"); + printf("-f: The APIC bus frequency (in MHz) to be configured for the guest.\n"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + /* + * Arbitrarilty default to 25MHz for the APIC bus frequency, which is + * different enough from the default 1GHz to be interesting. + */ + uint64_t apic_hz = 25 * 1000 * 1000; + uint64_t delay_ms = 100; + int opt; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS)); + + while ((opt = getopt(argc, argv, "d:f:h")) != -1) { + switch (opt) { + case 'f': + apic_hz = atoi_positive("APIC bus frequency", optarg) * 1000 * 1000; + break; + case 'd': + delay_ms = atoi_positive("Delay in milliseconds", optarg); + break; + case 'h': + default: + help(argv[0]); + exit(KSFT_SKIP); + } + } + + run_apic_bus_clock_test(apic_hz, delay_ms, false); + run_apic_bus_clock_test(apic_hz, delay_ms, true); +} diff --git a/tools/testing/selftests/kvm/x86/cpuid_test.c b/tools/testing/selftests/kvm/x86/cpuid_test.c new file mode 100644 index 000000000000..7b3fda6842bc --- /dev/null +++ b/tools/testing/selftests/kvm/x86/cpuid_test.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat Inc. 
+ * + * Generic tests for KVM CPUID set/get ioctls + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct cpuid_mask { + union { + struct { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + }; + u32 regs[4]; + }; +}; + +static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) +{ + int i; + u32 eax, ebx, ecx, edx; + + for (i = 0; i < guest_cpuid->nent; i++) { + __cpuid(guest_cpuid->entries[i].function, + guest_cpuid->entries[i].index, + &eax, &ebx, &ecx, &edx); + + GUEST_ASSERT_EQ(eax, guest_cpuid->entries[i].eax); + GUEST_ASSERT_EQ(ebx, guest_cpuid->entries[i].ebx); + GUEST_ASSERT_EQ(ecx, guest_cpuid->entries[i].ecx); + GUEST_ASSERT_EQ(edx, guest_cpuid->entries[i].edx); + } + +} + +static void guest_main(struct kvm_cpuid2 *guest_cpuid) +{ + GUEST_SYNC(1); + + test_guest_cpuids(guest_cpuid); + + GUEST_SYNC(2); + + GUEST_ASSERT_EQ(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF), 0x40000001); + + GUEST_DONE(); +} + +static struct cpuid_mask get_const_cpuid_mask(const struct kvm_cpuid_entry2 *entry) +{ + struct cpuid_mask mask; + + memset(&mask, 0xff, sizeof(mask)); + + switch (entry->function) { + case 0x1: + mask.regs[X86_FEATURE_OSXSAVE.reg] &= ~BIT(X86_FEATURE_OSXSAVE.bit); + break; + case 0x7: + mask.regs[X86_FEATURE_OSPKE.reg] &= ~BIT(X86_FEATURE_OSPKE.bit); + break; + case 0xd: + /* + * CPUID.0xD.{0,1}.EBX enumerate XSAVE size based on the current + * XCR0 and IA32_XSS MSR values. + */ + if (entry->index < 2) + mask.ebx = 0; + break; + } + return mask; +} + +static void compare_cpuids(const struct kvm_cpuid2 *cpuid1, + const struct kvm_cpuid2 *cpuid2) +{ + const struct kvm_cpuid_entry2 *e1, *e2; + int i; + + TEST_ASSERT(cpuid1->nent == cpuid2->nent, + "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent); + + for (i = 0; i < cpuid1->nent; i++) { + struct cpuid_mask mask; + + e1 = &cpuid1->entries[i]; + e2 = &cpuid2->entries[i]; + + TEST_ASSERT(e1->function == e2->function && + e1->index == e2->index && e1->flags == e2->flags, + "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 0x%x.%d.%x", + i, e1->function, e1->index, e1->flags, + e2->function, e2->index, e2->flags); + + /* Mask off dynamic bits, e.g. OSXSAVE, when comparing entries. 
*/ + mask = get_const_cpuid_mask(e1); + + TEST_ASSERT((e1->eax & mask.eax) == (e2->eax & mask.eax) && + (e1->ebx & mask.ebx) == (e2->ebx & mask.ebx) && + (e1->ecx & mask.ecx) == (e2->ecx & mask.ecx) && + (e1->edx & mask.edx) == (e2->edx & mask.edx), + "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", + e1->function, e1->index, + e1->eax & mask.eax, e1->ebx & mask.ebx, + e1->ecx & mask.ecx, e1->edx & mask.edx, + e2->eax & mask.eax, e2->ebx & mask.ebx, + e2->ecx & mask.ecx, e2->edx & mask.edx); + } +} + +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } +} + +struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) +{ + int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); + struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); + + memcpy(guest_cpuids, cpuid, size); + + *p_gva = gva; + return guest_cpuids; +} + +static void set_cpuid_after_run(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *ent; + int rc; + u32 eax, ebx, x; + + /* Setting unmodified CPUID is allowed */ + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + + /* Changing CPU features is forbidden */ + ent = vcpu_get_cpuid_entry(vcpu, 0x7); + ebx = ent->ebx; + ent->ebx--; + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(rc, "Changing CPU features should fail"); + ent->ebx = ebx; + + /* Changing MAXPHYADDR is forbidden */ + ent = vcpu_get_cpuid_entry(vcpu, 0x80000008); + eax = ent->eax; + x = eax & 0xff; + ent->eax = (eax & ~0xffu) | (x - 1); + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); + ent->eax = eax; +} + +static void test_get_cpuid2(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1); + int i, r; + + vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, + "KVM didn't update nent on success, wanted %u, got %u", + vcpu->cpuid->nent, cpuid->nent); + + for (i = 0; i < vcpu->cpuid->nent; i++) { + cpuid->nent = i; + r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r)); + TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure"); + } + free(cpuid); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t cpuid_gva; + struct kvm_vm *vm; + int stage; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + compare_cpuids(kvm_get_supported_cpuid(), vcpu->cpuid); + + vcpu_alloc_cpuid(vm, &cpuid_gva, vcpu->cpuid); + + vcpu_args_set(vcpu, 1, cpuid_gva); + + for (stage = 0; stage < 3; stage++) + run_vcpu(vcpu, stage); + + set_cpuid_after_run(vcpu); + + test_get_cpuid2(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86/cr4_cpuid_sync_test.c new file mode 100644 index 000000000000..28cc66454601 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/cr4_cpuid_sync_test.c @@ -0,0 +1,100 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * CR4 and CPUID sync test + * + * Copyright 2018, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang + */ + +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define MAGIC_HYPERCALL_PORT 0x80 + +static void guest_code(void) +{ + u32 regs[4] = { + [KVM_CPUID_EAX] = X86_FEATURE_OSXSAVE.function, + [KVM_CPUID_ECX] = X86_FEATURE_OSXSAVE.index, + }; + + /* CR4.OSXSAVE should be enabled by default (for selftests vCPUs). */ + GUEST_ASSERT(get_cr4() & X86_CR4_OSXSAVE); + + /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); + + /* + * Notify hypervisor to clear CR4.0SXSAVE, do CPUID and save output, + * and then restore CR4. Do this all in assembly to ensure no AVX + * instructions are executed while OSXSAVE=0. + */ + asm volatile ( + "out %%al, $" __stringify(MAGIC_HYPERCALL_PORT) "\n\t" + "cpuid\n\t" + "mov %%rdi, %%cr4\n\t" + : "+a" (regs[KVM_CPUID_EAX]), + "=b" (regs[KVM_CPUID_EBX]), + "+c" (regs[KVM_CPUID_ECX]), + "=d" (regs[KVM_CPUID_EDX]) + : "D" (get_cr4()) + ); + + /* Verify KVM cleared OSXSAVE in CPUID when it was cleared in CR4. */ + GUEST_ASSERT(!(regs[X86_FEATURE_OSXSAVE.reg] & BIT(X86_FEATURE_OSXSAVE.bit))); + + /* Verify restoring CR4 also restored OSXSAVE in CPUID. */ + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_sregs sregs; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + while (1) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (vcpu->run->io.port == MAGIC_HYPERCALL_PORT && + vcpu->run->io.direction == KVM_EXIT_IO_OUT) { + /* emulate hypervisor clearing CR4.OSXSAVE */ + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 &= ~X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + continue; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/debug_regs.c b/tools/testing/selftests/kvm/x86/debug_regs.c new file mode 100644 index 000000000000..2d814c1d1dc4 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/debug_regs.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest debug register tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include "kvm_util.h" +#include "processor.h" +#include "apic.h" + +#define DR6_BD (1 << 13) +#define DR7_GD (1 << 13) + +#define IRQ_VECTOR 0xAA + +/* For testing data access debug BP */ +uint32_t guest_value; + +extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; + +static void guest_code(void) +{ + /* Create a pending interrupt on current vCPU */ + x2apic_enable(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | + APIC_DM_FIXED | IRQ_VECTOR); + + /* + * Software BP tests. + * + * NOTE: sw_bp need to be before the cmd here, because int3 is an + * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we + * capture it using the vcpu exception bitmap). 
+ */ + asm volatile("sw_bp: int3"); + + /* Hardware instruction BP test */ + asm volatile("hw_bp: nop"); + + /* Hardware data BP test */ + asm volatile("mov $1234,%%rax;\n\t" + "mov %%rax,%0;\n\t write_data:" + : "=m" (guest_value) : : "rax"); + + /* + * Single step test, covers 2 basic instructions and 2 emulated + * + * Enable interrupts during the single stepping to see that pending + * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ. + * + * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler + * exits to userspace due to single-step being enabled. + */ + asm volatile("ss_start: " + "sti\n\t" + "xor %%eax,%%eax\n\t" + "cpuid\n\t" + "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t" + "wrmsr\n\t" + "cli\n\t" + : : : "eax", "ebx", "ecx", "edx"); + + /* DR6.BD test */ + asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); + GUEST_DONE(); +} + +#define CAST_TO_RIP(v) ((unsigned long long)&(v)) + +static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) +{ + struct kvm_regs regs; + + vcpu_regs_get(vcpu, ®s); + regs.rip += insn_len; + vcpu_regs_set(vcpu, ®s); +} + +int main(void) +{ + struct kvm_guest_debug debug; + unsigned long long target_dr6, target_rip; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + uint64_t cmd; + int i; + /* Instruction lengths starting at ss_start */ + int ss_size[6] = { + 1, /* sti*/ + 2, /* xor */ + 2, /* cpuid */ + 5, /* mov */ + 2, /* rdmsr */ + 1, /* cli */ + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + /* Test software BPs - int3 */ + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == BP_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(sw_bp), + "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(sw_bp)); + vcpu_skip_insn(vcpu, 1); + + /* Test instruction HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); + debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(hw_bp) && + run->debug.arch.dr6 == target_dr6, + "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(hw_bp), + run->debug.arch.dr6, target_dr6); + } + /* Skip "nop" */ + vcpu_skip_insn(vcpu, 1); + + /* Test data access HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); + debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | + (0x000d0000UL << (4*i)); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(write_data) && + run->debug.arch.dr6 
== target_dr6, + "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(write_data), + run->debug.arch.dr6, target_dr6); + /* Rollback the 4-bytes "mov" */ + vcpu_skip_insn(vcpu, -7); + } + /* Skip the 4-bytes "mov" */ + vcpu_skip_insn(vcpu, 7); + + /* Test single step */ + target_rip = CAST_TO_RIP(ss_start); + target_dr6 = 0xffff4ff0ULL; + for (i = 0; i < ARRAY_SIZE(ss_size); i++) { + target_rip += ss_size[i]; + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | + KVM_GUESTDBG_BLOCKIRQ; + debug.arch.debugreg[7] = 0x00000400; + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == target_rip && + run->debug.arch.dr6 == target_dr6, + "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + } + + /* Finally test global disable */ + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[7] = 0x400 | DR7_GD; + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + target_dr6 = 0xffff0ff0 | DR6_BD; + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(bd_start) && + run->debug.arch.dr6 == target_dr6, + "DR7.GD: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + + /* Disable all debug controls, run to the end */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + cmd = get_ucall(vcpu, &uc); + TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c new file mode 100644 index 000000000000..2929c067c207 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty logging page splitting test + * + * Based on dirty_log_perf.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2023, Google, Inc. 
+ */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "test_util.h" +#include "memstress.h" +#include "guest_modes.h" +#include "ucall_common.h" + +#define VCPUS 2 +#define SLOTS 2 +#define ITERATIONS 2 + +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB; + +static u64 dirty_log_manual_caps; +static bool host_quit; +static int iteration; +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +struct kvm_page_stats { + uint64_t pages_4k; + uint64_t pages_2m; + uint64_t pages_1g; + uint64_t hugepages; +}; + +static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage) +{ + stats->pages_4k = vm_get_stat(vm, "pages_4k"); + stats->pages_2m = vm_get_stat(vm, "pages_2m"); + stats->pages_1g = vm_get_stat(vm, "pages_1g"); + stats->hugepages = stats->pages_2m + stats->pages_1g; + + pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n", + stage, stats->pages_4k, stats->pages_2m, stats->pages_1g, + stats->hugepages); +} + +static void run_vcpu_iteration(struct kvm_vm *vm) +{ + int i; + + iteration++; + for (i = 0; i < VCPUS; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != + iteration) + ; + } +} + +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + int vcpu_idx = vcpu_args->vcpu_idx; + + while (!READ_ONCE(host_quit)) { + int current_iteration = READ_ONCE(iteration); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; + + /* Wait for the start of the next iteration to be signaled. */ + while (current_iteration == READ_ONCE(iteration) && + READ_ONCE(iteration) >= 0 && + !READ_ONCE(host_quit)) + ; + } +} + +static void run_test(enum vm_guest_mode mode, void *unused) +{ + struct kvm_vm *vm; + unsigned long **bitmaps; + uint64_t guest_num_pages; + uint64_t host_num_pages; + uint64_t pages_per_slot; + int i; + struct kvm_page_stats stats_populated; + struct kvm_page_stats stats_dirty_logging_enabled; + struct kvm_page_stats stats_dirty_pass[ITERATIONS]; + struct kvm_page_stats stats_clear_pass[ITERATIONS]; + struct kvm_page_stats stats_dirty_logging_disabled; + struct kvm_page_stats stats_repopulated; + + vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size, + SLOTS, backing_src, false); + + guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + pages_per_slot = host_num_pages / SLOTS; + TEST_ASSERT_EQ(host_num_pages, pages_per_slot * SLOTS); + TEST_ASSERT(!(host_num_pages % 512), + "Number of pages, '%lu' not a multiple of 2MiB", host_num_pages); + + bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot); + + if (dirty_log_manual_caps) + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, + dirty_log_manual_caps); + + /* Start the iterations */ + iteration = -1; + host_quit = false; + + for (i = 0; i < VCPUS; i++) + vcpu_last_completed_iteration[i] = -1; + + memstress_start_vcpu_threads(VCPUS, vcpu_worker); + + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_populated, "populating memory"); + + /* Enable dirty logging */ + memstress_enable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging"); + + while (iteration < ITERATIONS) { + run_vcpu_iteration(vm); + 
get_page_stats(vm, &stats_dirty_pass[iteration - 1], + "dirtying memory"); + + memstress_get_dirty_log(vm, bitmaps, SLOTS); + + if (dirty_log_manual_caps) { + memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot); + + get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log"); + } + } + + /* Disable dirty logging */ + memstress_disable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging"); + + /* Run vCPUs again to fault pages back in. */ + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_repopulated, "repopulating memory"); + + /* + * Tell the vCPU threads to quit. No need to manually check that vCPUs + * have stopped running after disabling dirty logging, the join will + * wait for them to exit. + */ + host_quit = true; + memstress_join_vcpu_threads(VCPUS); + + memstress_free_bitmaps(bitmaps, SLOTS); + memstress_destroy_vm(vm); + + TEST_ASSERT_EQ((stats_populated.pages_2m * 512 + + stats_populated.pages_1g * 512 * 512), host_num_pages); + + /* + * Check that all huge pages were split. Since large pages can only + * exist in the data slot, and the vCPUs should have dirtied all pages + * in the data slot, there should be no huge pages left after splitting. + * Splitting happens at dirty log enable time without + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass + * with that capability. + */ + if (dirty_log_manual_caps) { + TEST_ASSERT_EQ(stats_clear_pass[0].hugepages, 0); + TEST_ASSERT(stats_clear_pass[0].pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_clear_pass[0].pages_4k); + TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages); + } else { + TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0); + TEST_ASSERT(stats_dirty_logging_enabled.pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_dirty_logging_enabled.pages_4k); + } + + /* + * Once dirty logging is disabled and the vCPUs have touched all their + * memory again, the hugepage counts should be the same as they were + * right after initial population of memory. + */ + TEST_ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m); + TEST_ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n", + name); + puts(""); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + backing_src_help("-s"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + int opt; + + TEST_REQUIRE(get_kvm_param_bool("eager_page_split")); + TEST_REQUIRE(get_kvm_param_bool("tdp_mmu")); + + while ((opt = getopt(argc, argv, "b:hs:")) != -1) { + switch (opt) { + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'h': + help(argv[0]); + exit(0); + case 's': + backing_src = parse_backing_src_type(optarg); + break; + default: + help(argv[0]); + exit(1); + } + } + + if (!is_backing_src_hugetlb(backing_src)) { + pr_info("This test will only work reliably with HugeTLB memory. 
" + "It can work with THP, but that is best effort.\n"); + } + + guest_modes_append_default(); + + dirty_log_manual_caps = 0; + for_each_guest_mode(run_test, NULL); + + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + + if (dirty_log_manual_caps) { + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + for_each_guest_mode(run_test, NULL); + } else { + pr_info("Skipping testing with MANUAL_PROTECT as it is not supported"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86/exit_on_emulation_failure_test.c new file mode 100644 index 000000000000..81055476d394 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/exit_on_emulation_failure_test.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + * + * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. + */ +#include "flds_emulation.h" +#include "test_util.h" +#include "ucall_common.h" + +#define MMIO_GPA 0x700000000 +#define MMIO_GVA MMIO_GPA + +static void guest_code(void) +{ + /* Execute flds with an MMIO address to force KVM to emulate it. */ + flds(MMIO_GVA); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); + virt_map(vm, MMIO_GVA, MMIO_GPA, 1); + + vcpu_run(vcpu); + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/feature_msrs_test.c b/tools/testing/selftests/kvm/x86/feature_msrs_test.c new file mode 100644 index 000000000000..a72f13ae2edb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/feature_msrs_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +static bool is_kvm_controlled_msr(uint32_t msr) +{ + return msr == MSR_IA32_VMX_CR0_FIXED1 || msr == MSR_IA32_VMX_CR4_FIXED1; +} + +/* + * For VMX MSRs with a "true" variant, KVM requires userspace to set the "true" + * MSR, and doesn't allow setting the hidden version. + */ +static bool is_hidden_vmx_msr(uint32_t msr) +{ + switch (msr) { + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + return true; + default: + return false; + } +} + +static bool is_quirked_msr(uint32_t msr) +{ + return msr != MSR_AMD64_DE_CFG; +} + +static void test_feature_msr(uint32_t msr) +{ + const uint64_t supported_mask = kvm_get_feature_msr(msr); + uint64_t reset_value = is_quirked_msr(msr) ? supported_mask : 0; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* + * Don't bother testing KVM-controlled MSRs beyond verifying that the + * MSR can be read from userspace. Any value is effectively legal, as + * KVM is bound by x86 architecture, not by ABI. + */ + if (is_kvm_controlled_msr(msr)) + return; + + /* + * More goofy behavior. KVM reports the host CPU's actual revision ID, + * but initializes the vCPU's revision ID to an arbitrary value. + */ + if (msr == MSR_IA32_UCODE_REV) + reset_value = host_cpu_is_intel ? 
0x100000000ULL : 0x01000065; + + /* + * For quirked MSRs, KVM's ABI is to initialize the vCPU's value to the + * full set of features supported by KVM. For non-quirked MSRs, and + * when the quirk is disabled, KVM must zero-initialize the MSR and let + * userspace do the configuration. + */ + vm = vm_create_with_one_vcpu(&vcpu, NULL); + TEST_ASSERT(vcpu_get_msr(vcpu, msr) == reset_value, + "Wanted 0x%lx for %squirked MSR 0x%x, got 0x%lx", + reset_value, is_quirked_msr(msr) ? "" : "non-", msr, + vcpu_get_msr(vcpu, msr)); + if (!is_hidden_vmx_msr(msr)) + vcpu_set_msr(vcpu, msr, supported_mask); + kvm_vm_free(vm); + + if (is_hidden_vmx_msr(msr)) + return; + + if (!kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2) || + !(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + return; + + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_STUFF_FEATURE_MSRS); + + vcpu = vm_vcpu_add(vm, 0, NULL); + TEST_ASSERT(!vcpu_get_msr(vcpu, msr), + "Quirk disabled, wanted '0' for MSR 0x%x, got 0x%lx", + msr, vcpu_get_msr(vcpu, msr)); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + const struct kvm_msr_list *feature_list; + int i; + + /* + * Skip the entire test if MSR_FEATURES isn't supported, other tests + * will cover the "regular" list of MSRs, the coverage here is purely + * opportunistic and not interesting on its own. + */ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES)); + + (void)kvm_get_msr_index_list(); + + feature_list = kvm_get_feature_msr_index_list(); + for (i = 0; i < feature_list->nmsrs; i++) + test_feature_msr(feature_list->indices[i]); +} diff --git a/tools/testing/selftests/kvm/x86/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86/fix_hypercall_test.c new file mode 100644 index 000000000000..762628f7d4ba --- /dev/null +++ b/tools/testing/selftests/kvm/x86/fix_hypercall_test.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include +#include +#include +#include + +#include "kvm_test_harness.h" +#include "apic.h" +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* VMCALL and VMMCALL are both 3-byte opcodes. 
*/ +#define HYPERCALL_INSN_SIZE 3 + +static bool quirk_disabled; + +static void guest_ud_handler(struct ex_regs *regs) +{ + regs->rax = -EFAULT; + regs->rip += HYPERCALL_INSN_SIZE; +} + +static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; +static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; + +extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; +static uint64_t do_sched_yield(uint8_t apic_id) +{ + uint64_t ret; + + asm volatile("hypercall_insn:\n\t" + ".byte 0xcc,0xcc,0xcc\n\t" + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); + + return ret; +} + +static void guest_main(void) +{ + const uint8_t *native_hypercall_insn; + const uint8_t *other_hypercall_insn; + uint64_t ret; + + if (host_cpu_is_intel) { + native_hypercall_insn = vmx_vmcall; + other_hypercall_insn = svm_vmmcall; + } else if (host_cpu_is_amd) { + native_hypercall_insn = svm_vmmcall; + other_hypercall_insn = vmx_vmcall; + } else { + GUEST_ASSERT(0); + /* unreachable */ + return; + } + + memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); + + ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + + /* + * If the quirk is disabled, verify that guest_ud_handler() "returned" + * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is + * enabled, verify that the hypercall succeeded and that KVM patched in + * the "right" hypercall. + */ + if (quirk_disabled) { + GUEST_ASSERT(ret == (uint64_t)-EFAULT); + GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } else { + GUEST_ASSERT(!ret); + GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } + + GUEST_DONE(); +} + +KVM_ONE_VCPU_TEST_SUITE(fix_hypercall); + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); + break; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)", + uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason)); + } +} + +static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) +{ + struct kvm_vm *vm = vcpu->vm; + + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); + + if (disable_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + + quirk_disabled = disable_quirk; + sync_global_to_guest(vm, quirk_disabled); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + enter_guest(vcpu); +} + +KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main) +{ + test_fix_hypercall(vcpu, false); +} + +KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main) +{ + test_fix_hypercall(vcpu, true); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86/flds_emulation.h b/tools/testing/selftests/kvm/x86/flds_emulation.h new file mode 100644 index 000000000000..37b1a9f52864 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/flds_emulation.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_FLDS_EMULATION_H +#define SELFTEST_KVM_FLDS_EMULATION_H + +#include "kvm_util.h" + +#define FLDS_MEM_EAX ".byte 0xd9, 0x00" + +/* + * flds is an 
instruction that the KVM instruction emulator is known not to + * support. This can be used in guest code along with a mechanism to force + * KVM to emulate the instruction (e.g. by providing an MMIO address) to + * exercise emulation failures. + */ +static inline void flds(uint64_t address) +{ + __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address)); +} + +static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + uint8_t *insn_bytes; + uint64_t flags; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Unexpected suberror: %u", + run->emulation_failure.suberror); + + flags = run->emulation_failure.flags; + TEST_ASSERT(run->emulation_failure.ndata >= 3 && + flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, + "run->emulation_failure is missing instruction bytes"); + + TEST_ASSERT(run->emulation_failure.insn_size >= 2, + "Expected a 2-byte opcode for 'flds', got %d bytes", + run->emulation_failure.insn_size); + + insn_bytes = run->emulation_failure.insn_bytes; + TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, + "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x", + insn_bytes[0], insn_bytes[1]); + + vcpu_regs_get(vcpu, ®s); + regs.rip += 2; + vcpu_regs_set(vcpu, ®s); +} + +#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */ diff --git a/tools/testing/selftests/kvm/x86/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86/hwcr_msr_test.c new file mode 100644 index 000000000000..10b1b0ba374e --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hwcr_msr_test.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023, Google LLC. + */ +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit) +{ + const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8); + const uint64_t valid = BIT_ULL(18) | BIT_ULL(24); + const uint64_t legal = ignored | valid; + uint64_t val = BIT_ULL(bit); + uint64_t actual; + int r; + + r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val); + TEST_ASSERT(val & ~legal ? !r : r == 1, + "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s", + val, val & ~legal ? "fail" : "succeed"); + + actual = vcpu_get_msr(vcpu, MSR_K7_HWCR); + TEST_ASSERT(actual == (val & valid), + "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx", + bit, actual, (val & valid)); + + vcpu_set_msr(vcpu, MSR_K7_HWCR, 0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + unsigned int bit; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + for (bit = 0; bit < BITS_PER_LONG; bit++) + test_hwcr_bit(vcpu, bit); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_clock.c b/tools/testing/selftests/kvm/x86/hyperv_clock.c new file mode 100644 index 000000000000..e058bc676cd6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_clock.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. 
+ *
+ * Tests for Hyper-V clocksources
+ */
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+} __packed;
+
+/* Simplified mul_u64_u64_shr() */
+static inline u64 mul_u64_u64_shr64(u64 a, u64 b)
+{
+ union {
+ u64 ll;
+ struct {
+ u32 low, high;
+ } l;
+ } rm, rn, rh, a0, b0;
+ u64 c;
+
+ a0.ll = a;
+ b0.ll = b;
+
+ rm.ll = (u64)a0.l.low * b0.l.high;
+ rn.ll = (u64)a0.l.high * b0.l.low;
+ rh.ll = (u64)a0.l.high * b0.l.high;
+
+ rh.l.low = c = rm.l.high + rn.l.high + rh.l.low;
+ rh.l.high = (c >> 32) + rh.l.high;
+
+ return rh.ll;
+}
+
+static inline void nop_loop(void)
+{
+ int i;
+
+ for (i = 0; i < 100000000; i++)
+ asm volatile("nop");
+}
+
+static inline void check_tsc_msr_rdtsc(void)
+{
+ u64 tsc_freq, r1, r2, t1, t2;
+ s64 delta_ns;
+
+ tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
+ GUEST_ASSERT(tsc_freq > 0);
+
+ /* For increased accuracy, take mean rdtsc() before and after rdmsr() */
+ r1 = rdtsc();
+ t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ r1 = (r1 + rdtsc()) / 2;
+ nop_loop();
+ r2 = rdtsc();
+ t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ r2 = (r2 + rdtsc()) / 2;
+
+ GUEST_ASSERT(r2 > r1 && t2 > t1);
+
+ /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */
+ delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq);
+ if (delta_ns < 0)
+ delta_ns = -delta_ns;
+
+ /* 1% tolerance */
+ GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100);
+}
+
+static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page)
+{
+ return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset;
+}
+
+static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page)
+{
+ u64 r1, r2, t1, t2;
+
+ /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */
+ t1 = get_tscpage_ts(tsc_page);
+ r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+
+ /* 10 ms tolerance */
+ GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000);
+ nop_loop();
+
+ t2 = get_tscpage_ts(tsc_page);
+ r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000);
+}
+
+static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa)
+{
+ u64 tsc_scale, tsc_offset;
+
+ /* Set Guest OS id to enable Hyper-V emulation */
+ GUEST_SYNC(1);
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ GUEST_SYNC(2);
+
+ check_tsc_msr_rdtsc();
+
+ GUEST_SYNC(3);
+
+ /* Set up TSC page in disabled state, check that it's clean */
+ wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa);
+ GUEST_ASSERT(tsc_page->tsc_sequence == 0);
+ GUEST_ASSERT(tsc_page->tsc_scale == 0);
+ GUEST_ASSERT(tsc_page->tsc_offset == 0);
+
+ GUEST_SYNC(4);
+
+ /* Set up TSC page in enabled state */
+ wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1);
+ GUEST_ASSERT(tsc_page->tsc_sequence != 0);
+
+ GUEST_SYNC(5);
+
+ check_tsc_msr_tsc_page(tsc_page);
+
+ GUEST_SYNC(6);
+
+ tsc_offset = tsc_page->tsc_offset;
+ /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */
+
+ GUEST_SYNC(7);
+ /* Sanity check TSC page timestamp, it should be close to 0 */
+ GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000);
+
+ GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset);
+
+ nop_loop();
+
+ /*
+ * Enable Re-enlightenment and check that TSC page stays constant across
+ * KVM_SET_CLOCK. 
+ */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); + tsc_offset = tsc_page->tsc_offset; + tsc_scale = tsc_page->tsc_scale; + GUEST_SYNC(8); + GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); + GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); + + GUEST_SYNC(9); + + check_tsc_msr_tsc_page(tsc_page); + + /* + * Disable re-enlightenment and TSC page, check that KVM doesn't update + * it anymore. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); + wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); + memset(tsc_page, 0, sizeof(*tsc_page)); + + GUEST_SYNC(10); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + + GUEST_DONE(); +} + +static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = vcpu_get_msr(vcpu, HV_X64_MSR_TSC_FREQUENCY); + TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); + + /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ + r1 = rdtsc(); + t1 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; + nop_loop(); + r2 = rdtsc(); + t2 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; + + TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, + "Elapsed time does not match (MSR=%ld, TSC=%ld)", + (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + vm_vaddr_t tsc_page_gva; + int stage; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + vcpu_set_hv_cpuid(vcpu); + + tsc_page_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); + TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, + "TSC page has to be page aligned"); + vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + + host_check_tsc_msr_rdtsc(vcpu); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + /* Keep in sync with guest_main() */ + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d", + stage); + goto out; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, + "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + /* Reset kvmclock triggering TSC page update */ + if (stage == 7 || stage == 8 || stage == 10) { + struct kvm_clock_data clock = {0}; + + vm_ioctl(vm, KVM_SET_CLOCK, &clock); + } + } + +out: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c new file mode 100644 index 000000000000..4f5881d4ef66 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_HYPERV_CPUID + * + * 
Copyright (C) 2018, Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+static void guest_code(void)
+{
+}
+
+static bool smt_possible(void)
+{
+ char buf[16];
+ FILE *f;
+ bool res = true;
+
+ f = fopen("/sys/devices/system/cpu/smt/control", "r");
+ if (f) {
+ if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) {
+ if (!strncmp(buf, "forceoff", 8) ||
+ !strncmp(buf, "notsupported", 12))
+ res = false;
+ }
+ fclose(f);
+ }
+
+ return res;
+}
+
+static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries,
+ bool evmcs_expected)
+{
+ int i;
+ int nent_expected = 10;
+ u32 test_val;
+
+ TEST_ASSERT(hv_cpuid_entries->nent == nent_expected,
+ "KVM_GET_SUPPORTED_HV_CPUID should return %d entries"
+ " (returned %d)",
+ nent_expected, hv_cpuid_entries->nent);
+
+ for (i = 0; i < hv_cpuid_entries->nent; i++) {
+ const struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
+
+ TEST_ASSERT((entry->function >= 0x40000000) &&
+ (entry->function <= 0x40000082),
+ "function %x is out of supported range",
+ entry->function);
+
+ TEST_ASSERT(entry->index == 0,
+ ".index field should be zero");
+
+ TEST_ASSERT(entry->flags == 0,
+ ".flags field should be zero");
+
+ TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
+ !entry->padding[2], "padding should be zero");
+
+ switch (entry->function) {
+ case 0x40000000:
+ test_val = 0x40000082;
+
+ TEST_ASSERT(entry->eax == test_val,
+ "Wrong max leaf report in 0x40000000.EAX: %x"
+ " (evmcs=%d)",
+ entry->eax, evmcs_expected
+ );
+ break;
+ case 0x40000004:
+ test_val = entry->eax & (1UL << 18);
+
+ TEST_ASSERT(!!test_val == !smt_possible(),
+ "NoNonArchitecturalCoreSharing bit"
+ " doesn't reflect SMT setting");
+ break;
+ case 0x4000000A:
+ TEST_ASSERT(entry->eax & (1UL << 19),
+ "Enlightened MSR-Bitmap should always be supported"
+ " 0x40000000.EAX: %x", entry->eax);
+ if (evmcs_expected)
+ TEST_ASSERT((entry->eax & 0xffff) == 0x101,
+ "Supported Enlightened VMCS version range is supposed to be 1:1"
+ " 0x40000000.EAX: %x", entry->eax);
+
+ break;
+ default:
+ break;
+
+ }
+ /*
+ * If needed for debug:
+ * fprintf(stdout,
+ * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n",
+ * entry->function, entry->eax, entry->ebx, entry->ecx,
+ * entry->edx);
+ */
+ }
+}
+
+void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+ static struct kvm_cpuid2 cpuid = {.nent = 0};
+ int ret;
+
+ if (vcpu)
+ ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+ else
+ ret = __kvm_ioctl(vm->kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+
+ TEST_ASSERT(ret == -1 && errno == E2BIG,
+ "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
+ " it should have: %d %d", !vcpu ? 
"KVM" : "vCPU", ret, errno); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + const struct kvm_cpuid2 *hv_cpuid_entries; + struct kvm_vcpu *vcpu; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Test vCPU ioctl version */ + test_hv_cpuid_e2big(vm, vcpu); + + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + test_hv_cpuid(hv_cpuid_entries, false); + free((void *)hv_cpuid_entries); + + if (!kvm_cpu_has(X86_FEATURE_VMX) || + !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + goto do_sys; + } + vcpu_enable_evmcs(vcpu); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + test_hv_cpuid(hv_cpuid_entries, true); + free((void *)hv_cpuid_entries); + +do_sys: + /* Test system ioctl version */ + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) { + print_skip("KVM_CAP_SYS_HYPERV_CPUID not supported"); + goto out; + } + + test_hv_cpuid_e2big(vm, NULL); + + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); + test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); + +out: + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c new file mode 100644 index 000000000000..74cf19661309 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for Enlightened VMCS, including nested guest state. + */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" + +#include "hyperv.h" +#include "vmx.h" + +static int ud_count; + +static void guest_ud_handler(struct ex_regs *regs) +{ + ud_count++; + regs->rip += 3; /* VMLAUNCH */ +} + +static void guest_nmi_handler(struct ex_regs *regs) +{ +} + +static inline void rdmsr_from_l2(uint32_t msr) +{ + /* Currently, L1 doesn't preserve GPRs during vmexits. */ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +/* Exit to L1 from L2 with RDMSR instruction */ +void l2_guest_code(void) +{ + u64 unused; + + GUEST_SYNC(7); + + GUEST_SYNC(8); + + /* Forced exit to L1 upon restore */ + GUEST_SYNC(9); + + vmcall(); + + /* MSR-Bitmap tests */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ + vmcall(); + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ + + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS, + &unused); + + /* Done, exit to L1 and never come back. 
*/ + vmcall(); +} + +void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, + vm_vaddr_t hv_hcall_page_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa); + + x2apic_enable(); + + GUEST_SYNC(1); + GUEST_SYNC(2); + + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); + evmcs_enable(); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_evmcs(hv_pages)); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + current_evmcs->revision_id = -1u; + GUEST_ASSERT(vmlaunch()); + current_evmcs->revision_id = EVMCS_VERSION; + GUEST_SYNC(6); + + vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_NMI_EXITING); + + /* L2 TLB flush setup */ + current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa; + current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + current_evmcs->hv_vm_id = 1; + current_evmcs->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + /* + * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is + * up-to-date (RIP points where it should and not at the beginning + * of l2_guest_code(). GUEST_SYNC(9) checkes that. + */ + GUEST_ASSERT(!vmresume()); + + GUEST_SYNC(10); + + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Intercept RDMSR 0xc0000100 */ + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_USE_MSR_BITMAPS); + __set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + current_evmcs->hv_enlightenments_control.msr_bitmap = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + __set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. 
+ */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(11); + + /* Try enlightened vmptrld with an incorrect GPA */ + evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(ud_count == 1); + GUEST_DONE(); +} + +void inject_nmi(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + events.nmi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; + + vcpu_events_set(vcpu, &events); +} + +static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, + struct kvm_vcpu *vcpu) +{ + struct kvm_regs regs1, regs2; + struct kvm_x86_state *state; + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + return vcpu; +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); + + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); + + pr_info("Running L1 which uses EVMCS to run L2\n"); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + vcpu = save_restore_vm(vm, vcpu); + + /* Force immediate L2->L1 exit before resuming */ + if (stage == 8) { + pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); + inject_nmi(vcpu); + } + + /* + * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly + * restored VM (before the first KVM_RUN) to check that + * KVM_STATE_NESTED_EVMCS is not lost. 
+ */ + if (stage == 9) { + pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); + vcpu = save_restore_vm(vm, vcpu); + } + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_extended_hypercalls.c b/tools/testing/selftests/kvm/x86/hyperv_extended_hypercalls.c new file mode 100644 index 000000000000..949e08e98f31 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_extended_hypercalls.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES (0x8001), + * exit to userspace and receive result in guest. + * + * Negative tests are present in hyperv_features.c + * + * Copyright 2022 Google LLC + * Author: Vipin Sharma + */ +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +/* Any value is fine */ +#define EXT_CAPABILITIES 0xbull + +static void guest_code(vm_paddr_t in_pg_gpa, vm_paddr_t out_pg_gpa, + vm_vaddr_t out_pg_gva) +{ + uint64_t *output_gva; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, in_pg_gpa); + + output_gva = (uint64_t *)out_pg_gva; + + hyperv_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, in_pg_gpa, out_pg_gpa); + + /* TLFS states output will be a uint64_t value */ + GUEST_ASSERT_EQ(*output_gva, EXT_CAPABILITIES); + + GUEST_DONE(); +} + +int main(void) +{ + vm_vaddr_t hcall_out_page; + vm_vaddr_t hcall_in_page; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + uint64_t *outval; + struct ucall uc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); + + /* Verify if extended hypercalls are supported */ + if (!kvm_cpuid_has(kvm_get_supported_hv_cpuid(), + HV_ENABLE_EXTENDED_HYPERCALLS)) { + print_skip("Extended calls not supported by the kernel"); + exit(KSFT_SKIP); + } + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + vcpu_set_hv_cpuid(vcpu); + + /* Hypercall input */ + hcall_in_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_in_page), 0x0, vm->page_size); + + /* Hypercall output */ + hcall_out_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_out_page), 0x0, vm->page_size); + + vcpu_args_set(vcpu, 3, addr_gva2gpa(vm, hcall_in_page), + addr_gva2gpa(vm, hcall_out_page), hcall_out_page); + + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERV, + "Unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + outval = addr_gpa2hva(vm, run->hyperv.u.hcall.params[1]); + *outval = EXT_CAPABILITIES; + run->hyperv.u.hcall.result = HV_STATUS_SUCCESS; + + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + } + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c new file mode 100644 index 000000000000..068e9c69710d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. 
+ * + * Tests for Hyper-V features enablement + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +/* + * HYPERV_CPUID_ENLIGHTMENT_INFO.EBX is not a 'feature' CPUID leaf + * but to activate the feature it is sufficient to set it to a non-zero + * value. Use BIT(0) for that. + */ +#define HV_PV_SPINLOCKS_TEST \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EBX, 0) + +struct msr_data { + uint32_t idx; + bool fault_expected; + bool write; + u64 write_val; +}; + +struct hcall_data { + uint64_t control; + uint64_t expect; + bool ud_expected; +}; + +static bool is_write_only_msr(uint32_t msr) +{ + return msr == HV_X64_MSR_EOI; +} + +static void guest_msr(struct msr_data *msr) +{ + uint8_t vector = 0; + uint64_t msr_val = 0; + + GUEST_ASSERT(msr->idx); + + if (msr->write) + vector = wrmsr_safe(msr->idx, msr->write_val); + + if (!vector && (!msr->write || !is_write_only_msr(msr->idx))) + vector = rdmsr_safe(msr->idx, &msr_val); + + if (msr->fault_expected) + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP on %sMSR(0x%x), got vector '0x%x'", + msr->write ? "WR" : "RD", msr->idx, vector); + else + __GUEST_ASSERT(!vector, + "Expected success on %sMSR(0x%x), got vector '0x%x'", + msr->write ? "WR" : "RD", msr->idx, vector); + + if (vector || is_write_only_msr(msr->idx)) + goto done; + + if (msr->write) + __GUEST_ASSERT(!vector, + "WRMSR(0x%x) to '0x%lx', RDMSR read '0x%lx'", + msr->idx, msr->write_val, msr_val); + + /* Invariant TSC bit appears when TSC invariant control MSR is written to */ + if (msr->idx == HV_X64_MSR_TSC_INVARIANT_CONTROL) { + if (!this_cpu_has(HV_ACCESS_TSC_INVARIANT)) + GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC)); + else + GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC) == + !!(msr_val & HV_INVARIANT_TSC_EXPOSED)); + } + +done: + GUEST_DONE(); +} + +static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) +{ + u64 res, input, output; + uint8_t vector; + + GUEST_ASSERT_NE(hcall->control, 0); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + + if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { + input = pgs_gpa; + output = pgs_gpa + 4096; + } else { + input = output = 0; + } + + vector = __hyperv_hypercall(hcall->control, input, output, &res); + if (hcall->ud_expected) { + __GUEST_ASSERT(vector == UD_VECTOR, + "Expected #UD for control '%lu', got vector '0x%x'", + hcall->control, vector); + } else { + __GUEST_ASSERT(!vector, + "Expected no exception for control '%lu', got vector '0x%x'", + hcall->control, vector); + GUEST_ASSERT_EQ(res, hcall->expect); + } + + GUEST_DONE(); +} + +static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu) +{ + /* + * Enable all supported Hyper-V features, then clear the leafs holding + * the features that will be tested one by one. 
+ */ + vcpu_set_hv_cpuid(vcpu); + + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); +} + +static void guest_test_msrs_access(void) +{ + struct kvm_cpuid2 *prev_cpuid = NULL; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage = 0; + vm_vaddr_t msr_gva; + struct msr_data *msr; + bool has_invtsc = kvm_cpu_has(X86_FEATURE_INVTSC); + + while (true) { + vm = vm_create_with_one_vcpu(&vcpu, guest_msr); + + msr_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); + msr = addr_gva2hva(vm, msr_gva); + + vcpu_args_set(vcpu, 1, msr_gva); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); + + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + /* TODO: Make this entire test easier to maintain. */ + if (stage >= 21) + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); + + switch (stage) { + case 0: + /* + * Only available when Hyper-V identification is set + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = false; + msr->fault_expected = true; + break; + case 1: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = false; + msr->fault_expected = true; + break; + case 2: + vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); + /* + * HV_X64_MSR_GUEST_OS_ID has to be written first to make + * HV_X64_MSR_HYPERCALL available. + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = true; + msr->write_val = HYPERV_LINUX_OS_ID; + msr->fault_expected = false; + break; + case 3: + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = false; + msr->fault_expected = false; + break; + case 4: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = false; + msr->fault_expected = false; + break; + + case 5: + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = false; + msr->fault_expected = true; + break; + case 6: + vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_RUNTIME_AVAILABLE); + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = false; + msr->fault_expected = false; + break; + case 7: + /* Read only */ + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 8: + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = false; + msr->fault_expected = true; + break; + case 9: + vcpu_set_cpuid_feature(vcpu, HV_MSR_TIME_REF_COUNT_AVAILABLE); + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = false; + msr->fault_expected = false; + break; + case 10: + /* Read only */ + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 11: + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = false; + msr->fault_expected = true; + break; + case 12: + vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_INDEX_AVAILABLE); + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = false; + msr->fault_expected = false; + break; + case 13: + /* Read only */ + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 14: + msr->idx = HV_X64_MSR_RESET; + msr->write = false; + msr->fault_expected = true; + break; + case 15: + vcpu_set_cpuid_feature(vcpu, HV_MSR_RESET_AVAILABLE); + msr->idx = HV_X64_MSR_RESET; + msr->write = false; + msr->fault_expected = false; + break; + case 16: + msr->idx = HV_X64_MSR_RESET; + msr->write = true; + /* + * TODO: the test only 
writes '0' to HV_X64_MSR_RESET + * at the moment, writing some other value there will + * trigger real vCPU reset and the code is not prepared + * to handle it yet. + */ + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 17: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = false; + msr->fault_expected = true; + break; + case 18: + vcpu_set_cpuid_feature(vcpu, HV_MSR_REFERENCE_TSC_AVAILABLE); + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = false; + msr->fault_expected = false; + break; + case 19: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 20: + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = true; + break; + case 21: + /* + * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 + * capability enabled and guest visible CPUID bit unset. + */ + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = true; + break; + case 22: + vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNIC_AVAILABLE); + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = false; + break; + case 23: + msr->idx = HV_X64_MSR_EOM; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 24: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = false; + msr->fault_expected = true; + break; + case 25: + vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNTIMER_AVAILABLE); + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = false; + msr->fault_expected = false; + break; + case 26: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + case 27: + /* Direct mode test */ + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 1 << 12; + msr->fault_expected = true; + break; + case 28: + vcpu_set_cpuid_feature(vcpu, HV_STIMER_DIRECT_MODE_AVAILABLE); + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 1 << 12; + msr->fault_expected = false; + break; + + case 29: + msr->idx = HV_X64_MSR_EOI; + msr->write = false; + msr->fault_expected = true; + break; + case 30: + vcpu_set_cpuid_feature(vcpu, HV_MSR_APIC_ACCESS_AVAILABLE); + msr->idx = HV_X64_MSR_EOI; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + + case 31: + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = false; + msr->fault_expected = true; + break; + case 32: + vcpu_set_cpuid_feature(vcpu, HV_ACCESS_FREQUENCY_MSRS); + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = false; + msr->fault_expected = false; + break; + case 33: + /* Read only */ + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 34: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = false; + msr->fault_expected = true; + break; + case 35: + vcpu_set_cpuid_feature(vcpu, HV_ACCESS_REENLIGHTENMENT); + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = false; + msr->fault_expected = false; + break; + case 36: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + case 37: + /* Can only write '0' */ + msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 38: + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = false; + msr->fault_expected = true; + break; + case 39: + vcpu_set_cpuid_feature(vcpu, 
HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE);
+ msr->idx = HV_X64_MSR_CRASH_P0;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 40:
+ msr->idx = HV_X64_MSR_CRASH_P0;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+
+ case 41:
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 42:
+ vcpu_set_cpuid_feature(vcpu, HV_FEATURE_DEBUG_MSRS_AVAILABLE);
+ vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING);
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 43:
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = true;
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+
+ case 44:
+ /* MSR is not available when CPUID feature bit is unset */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 45:
+ /* MSR is available when CPUID feature bit is set */
+ if (!has_invtsc)
+ goto next_stage;
+ vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT);
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 46:
+ /* Writing bits other than 0 is forbidden */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = true;
+ msr->write_val = 0xdeadbeef;
+ msr->fault_expected = true;
+ break;
+ case 47:
+ /* Setting bit 0 enables the feature */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+
+ default:
+ kvm_vm_free(vm);
+ return;
+ }
+
+ vcpu_set_cpuid(vcpu);
+
+ memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent));
+
+ pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage,
+ msr->idx, msr->write ? 
"write" : "read"); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + return; + } + +next_stage: + stage++; + kvm_vm_free(vm); + } +} + +static void guest_test_hcalls_access(void) +{ + struct kvm_cpuid2 *prev_cpuid = NULL; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage = 0; + vm_vaddr_t hcall_page, hcall_params; + struct hcall_data *hcall; + + while (true) { + vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + hcall_params = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + hcall = addr_gva2hva(vm, hcall_params); + + vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); + + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + switch (stage) { + case 0: + vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); + hcall->control = 0xbeef; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + + case 1: + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 2: + vcpu_set_cpuid_feature(vcpu, HV_POST_MESSAGES); + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 3: + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 4: + vcpu_set_cpuid_feature(vcpu, HV_SIGNAL_EVENTS); + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 5: + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + case 6: + vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING); + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 7: + vcpu_set_cpuid_feature(vcpu, HV_DEBUGGING); + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_OPERATION_DENIED; + break; + + case 8: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 9: + vcpu_set_cpuid_feature(vcpu, HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 10: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 11: + vcpu_set_cpuid_feature(vcpu, HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 12: + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 13: + vcpu_set_cpuid_feature(vcpu, HV_X64_CLUSTER_IPI_RECOMMENDED); + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + case 14: + /* Nothing in 'sparse banks' -> success */ + hcall->control = HVCALL_SEND_IPI_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 15: + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 16: + 
vcpu_set_cpuid_feature(vcpu, HV_PV_SPINLOCKS_TEST); + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 17: + /* XMM fast hypercall */ + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = true; + break; + case 18: + vcpu_set_cpuid_feature(vcpu, HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = false; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 19: + hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 20: + vcpu_set_cpuid_feature(vcpu, HV_ENABLE_EXTENDED_HYPERCALLS); + hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES | HV_HYPERCALL_FAST_BIT; + hcall->expect = HV_STATUS_INVALID_PARAMETER; + break; + case 21: + kvm_vm_free(vm); + return; + } + + vcpu_set_cpuid(vcpu); + + memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); + + pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + return; + } + + stage++; + kvm_vm_free(vm); + } +} + +int main(void) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENFORCE_CPUID)); + + pr_info("Testing access to Hyper-V specific MSRs\n"); + guest_test_msrs_access(); + + pr_info("Testing access to Hyper-V hypercalls\n"); + guest_test_hcalls_access(); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c new file mode 100644 index 000000000000..22c0c124582f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. 
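+ * + * One 'sender' vCPU issues the IPIs while two 'receiver' vCPUs (VP indices 2 and 65, i.e. in different sparse banks) count what they receive; both HvCallSendSyntheticClusterIpi and the 'Ex' variant are exercised in 'slow', 'fast' and 'XMM fast' forms. For VP 65 alone, the sparse encoding used below is valid_bank_mask = BIT(1) and bank_contents[0] = BIT(65 - 64).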
+ * + */ +#include +#include + +#include "kvm_util.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define RECEIVER_VCPU_ID_1 2 +#define RECEIVER_VCPU_ID_2 65 + +#define IPI_VECTOR 0xfe + +static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1]; + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[2]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +}; + +static inline void hv_init(vm_vaddr_t pgs_gpa) +{ + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); +} + +static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + u32 vcpu_id; + + x2apic_enable(); + hv_init(pgs_gpa); + + vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + /* Signal sender vCPU we're ready */ + ipis_rcvd[vcpu_id] = (u64)-1; + + for (;;) + asm volatile("sti; hlt; cli"); +} + +static void guest_ipi_handler(struct ex_regs *regs) +{ + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + ipis_rcvd[vcpu_id]++; + wrmsr(HV_X64_MSR_EOI, 1); +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 100000000; i++) + asm volatile("nop"); +} + +static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page; + struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page; + int stage = 1, ipis_expected[2] = {0}; + + hv_init(pgs_gpa); + GUEST_SYNC(stage++); + + /* Wait for receiver vCPUs to come up */ + while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2]) + nop_loop(); + ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0; + + /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + ipi->vector = IPI_VECTOR; + ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 0; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + 
GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* + * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL. 
+ */ + ipi_ex->vp_set.valid_bank_mask = 0; + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, HV_GENERIC_SET_ALL); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + int old, r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + + TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id); + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + vm_vaddr_t hcall_page; + pthread_t threads[2]; + int stage = 1, r; + struct ucall uc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_SEND_IPI)); + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + + vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); + vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); + vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); + + vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_hv_cpuid(vcpu[0]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", r); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", errno); + + while (true) { + vcpu_run(vcpu[0]); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)", + uc.args[1], stage); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return r; +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c new file mode 100644 index 000000000000..0ddb63229bcb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022, Red Hat, Inc. + * + * Tests for Hyper-V extensions to SVM. 
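+ * + * Covers the enlightened MSR-Bitmap (toggled through the HV_VMCB_NESTED_ENLIGHTENMENTS clean bit) and the L2 'direct' TLB flush enlightenment with its synthetic HV_SVM_EXITCODE_ENL exit; see guest_code() below for the exact sequence.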
+ */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "hyperv.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* Exit to L1 from L2 with RDMSR instruction */ +static inline void rdmsr_from_l2(uint32_t msr) +{ + /* Currently, L1 doesn't preserve GPRs during vmexits. */ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +void l2_guest_code(void) +{ + u64 unused; + + GUEST_SYNC(3); + /* Exit to L1 */ + vmmcall(); + + /* MSR-Bitmap tests */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ + vmmcall(); + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ + + GUEST_SYNC(5); + + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS, &unused); + + /* Done, exit to L1 and never come back. */ + vmmcall(); +} + +static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, + struct hyperv_test_pages *hv_pages, + vm_vaddr_t pgs_gpa) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; + + GUEST_SYNC(1); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. 
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* L2 TLB flush setup */ + hve->partition_assist_page = hv_pages->partition_assist_gpa; + hve->hv_enlightenments_control.nested_flush_hypercall = 1; + hve->hv_vm_id = 1; + hve->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + + GUEST_SYNC(2); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(4); + vmcb->save.rip += 3; + + /* Intercept RDMSR 0xc0000100 */ + vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; + __set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + hve->hv_enlightenments_control.msr_bitmap = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + __set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + vmcb->save.rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. + */ + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL); + GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH); + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(6); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vcpu); + vcpu_alloc_svm(vm, &nested_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + + vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. 
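(GUEST_SYNC() passes the literal string "hello" in args[0] and the stage number in args[1], hence both checks.) 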
*/ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c new file mode 100644 index 000000000000..077cd0ec3040 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + */ +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define WORKER_VCPU_ID_1 2 +#define WORKER_VCPU_ID_2 65 + +#define NTRY 100 +#define NTEST_PAGES 2 + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_tlb_flush { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +/* + * Pass the following info to 'workers' and 'sender' + * - Hypercall page's GVA + * - Hypercall page's GPA + * - Test pages GVA + * - GVAs of the test pages' PTEs + */ +struct test_data { + vm_vaddr_t hcall_gva; + vm_paddr_t hcall_gpa; + vm_vaddr_t test_pages; + vm_vaddr_t test_pages_pte[NTEST_PAGES]; +}; + +/* 'Worker' vCPU code checking the contents of the test page */ +static void worker_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES; + u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64)); + u64 expected, val; + + x2apic_enable(); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + + for (;;) { + cpu_relax(); + + expected = READ_ONCE(*this_cpu); + + /* + * Make sure the value in the test page is read after reading + * the expectation for the first time. Pairs with wmb() in + * prepare_to_test(). + */ + rmb(); + + val = READ_ONCE(*(u64 *)data->test_pages); + + /* + * Make sure the value in the test page is read after before + * reading the expectation for the second time. Pairs with wmb() + * post_test(). + */ + rmb(); + + /* + * '0' indicates the sender is between iterations, wait until + * the sender is ready for this vCPU to start checking again. + */ + if (!expected) + continue; + + /* + * Re-read the per-vCPU byte to ensure the sender didn't move + * onto a new iteration. + */ + if (expected != READ_ONCE(*this_cpu)) + continue; + + GUEST_ASSERT(val == expected); + } +} + +/* + * Write per-CPU info indicating what each 'worker' CPU is supposed to see in + * test page. '0' means don't check. + */ +static void set_expected_val(void *addr, u64 val, int vcpu_id) +{ + void *exp_page = addr + PAGE_SIZE * NTEST_PAGES; + + *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val; +} + +/* + * Update PTEs swapping two test pages. 
+ * TODO: use swap()/xchg() when these are provided. + */ +static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2) +{ + uint64_t tmp = *(uint64_t *)pte_gva1; + + *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2; + *(uint64_t *)pte_gva2 = tmp; +} + +/* + * TODO: replace the silly NOP loop with a proper udelay() implementation. + */ +static inline void do_delay(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +/* + * Prepare to test: 'disable' workers by setting the expectation to '0', + * clear hypercall input page and then swap two test pages. + */ +static inline void prepare_to_test(struct test_data *data) +{ + /* Clear hypercall input page */ + memset((void *)data->hcall_gva, 0, PAGE_SIZE); + + /* 'Disable' workers */ + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2); + + /* Make sure workers are 'disabled' before we swap PTEs. */ + wmb(); + + /* Make sure workers have enough time to notice */ + do_delay(); + + /* Swap test page mappings */ + swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]); +} + +/* + * Finalize the test: check the hypercall result, set the expected val for + * 'worker' CPUs and give them some time to test. + */ +static inline void post_test(struct test_data *data, u64 exp1, u64 exp2) +{ + /* Make sure we change the expectation after swapping PTEs */ + wmb(); + + /* Set the expectation for workers, '0' means don't test */ + set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2); + + /* Make sure workers have enough time to test */ + do_delay(); +} + +#define TESTVAL1 0x0101010101010101 +#define TESTVAL2 0x0202020202020202 + +/* Main vCPU doing the test */ +static void sender_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva; + struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva; + vm_paddr_t hcall_gpa = data->hcall_gpa; + int i, stage = 1; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa); + + /* "Slow" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? 
TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->processor_mask = 0; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + /* "Fast" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : + TESTVAL2, i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT, + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + pthread_t threads[2]; + vm_vaddr_t test_data_page, gva; + vm_paddr_t gpa; + uint64_t *pte; + struct test_data *data; + struct ucall uc; + int stage = 1, r, i; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TLBFLUSH)); + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Test data page */ + test_data_page = vm_vaddr_alloc_page(vm); + data = (struct test_data *)addr_gva2hva(vm, test_data_page); + + /* Hypercall input/output */ + data->hcall_gva = vm_vaddr_alloc_pages(vm, 2); + data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva); + memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE); + + /* + * Test pages: the first one is filled with '0x01's, the second with '0x02's + * and the test will swap their mappings. The third page keeps the indication + * about the current state of mappings. + */ + data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1); + for (i = 0; i < NTEST_PAGES; i++) + memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i), + (u8)(i + 1), PAGE_SIZE); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2); + + /* + * Get PTE pointers for test pages and map them inside the guest. + * Use separate page for each PTE for simplicity. + */ + gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); + for (i = 0; i < NTEST_PAGES; i++) { + pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + gpa = addr_hva2gpa(vm, pte); + __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); + } + + /* + * Sender vCPU which performs the test: swaps test pages, sets expectation + * for 'workers' and issues TLB flush hypercalls. 
+ */ + vcpu_args_set(vcpu[0], 1, test_data_page); + vcpu_set_hv_cpuid(vcpu[0]); + + /* Create worker vCPUs which check the contents of the test pages */ + vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code); + vcpu_args_set(vcpu[1], 1, test_data_page); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code); + vcpu_args_set(vcpu[2], 1, test_data_page); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create() failed"); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create() failed"); + + while (true) { + vcpu_run(vcpu[0]); + TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)", + uc.args[1], stage); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/kvm_clock_test.c b/tools/testing/selftests/kvm/x86/kvm_clock_test.c new file mode 100644 index 000000000000..5bc12222d87a --- /dev/null +++ b/tools/testing/selftests/kvm/x86/kvm_clock_test.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Google LLC. + * + * Tests for adjusting the KVM clock from userspace + */ +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct test_case { + uint64_t kvmclock_base; + int64_t realtime_offset; +}; + +static struct test_case test_cases[] = { + { .kvmclock_base = 0 }, + { .kvmclock_base = 180 * NSEC_PER_SEC }, + { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC }, + { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC }, +}; + +#define GUEST_SYNC_CLOCK(__stage, __val) \ + GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0) + +static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti) +{ + int i; + + wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED); + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc())); +} + +#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) + +static inline void assert_flags(struct kvm_clock_data *data) +{ + TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS, + "unexpected clock data flags: %x (want set: %x)", + data->flags, EXPECTED_FLAGS); +} + +static void handle_sync(struct ucall *uc, struct kvm_clock_data *start, + struct kvm_clock_data *end) +{ + uint64_t obs, exp_lo, exp_hi; + + obs = uc->args[2]; + exp_lo = start->clock; + exp_hi = end->clock; + + assert_flags(start); + assert_flags(end); + + TEST_ASSERT(exp_lo <= obs && obs <= exp_hi, + "unexpected kvm-clock value: %"PRIu64" expected range: [%"PRIu64", %"PRIu64"]", + obs, exp_lo, exp_hi); + + pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", %"PRIu64"]\n", + obs, exp_lo, exp_hi); +} + +static void handle_abort(struct ucall *uc) +{ + REPORT_GUEST_ASSERT(*uc); +} + +static void setup_clock(struct kvm_vm *vm, struct test_case *test_case) +{ + 
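/* Program the kvmclock base and, optionally, a paired CLOCK_REALTIME reference shifted by the test case's offset. */ + 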
struct kvm_clock_data data; + + memset(&data, 0, sizeof(data)); + + data.clock = test_case->kvmclock_base; + if (test_case->realtime_offset) { + struct timespec ts; + int r; + + data.flags |= KVM_CLOCK_REALTIME; + do { + r = clock_gettime(CLOCK_REALTIME, &ts); + if (!r) + break; + } while (errno == EINTR); + + TEST_ASSERT(!r, "clock_gettime() failed: %d", r); + + data.realtime = ts.tv_sec * NSEC_PER_SEC; + data.realtime += ts.tv_nsec; + data.realtime += test_case->realtime_offset; + } + + vm_ioctl(vm, KVM_SET_CLOCK, &data); +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_clock_data start, end; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + setup_clock(vm, &test_cases[i]); + + vm_ioctl(vm, KVM_GET_CLOCK, &start); + + vcpu_run(vcpu); + vm_ioctl(vm, KVM_GET_CLOCK, &end); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(&uc, &start, &end); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + default: + TEST_ASSERT(0, "unhandled ucall: %ld", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t pvti_gva; + vm_paddr_t pvti_gpa; + struct kvm_vm *vm; + int flags; + + flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); + TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); + + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000); + pvti_gpa = addr_gva2gpa(vm, pvti_gva); + vcpu_args_set(vcpu, 2, pvti_gpa, pvti_gva); + + enter_guest(vcpu); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/kvm_pv_test.c b/tools/testing/selftests/kvm/x86/kvm_pv_test.c new file mode 100644 index 000000000000..78878b3a2725 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/kvm_pv_test.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct msr_data { + uint32_t idx; + const char *name; +}; + +#define TEST_MSR(msr) { .idx = msr, .name = #msr } +#define UCALL_PR_MSR 0xdeadbeef +#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) + +/* + * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or + * written, as the KVM_CPUID_FEATURES leaf is cleared. + */ +static struct msr_data msrs_to_test[] = { + TEST_MSR(MSR_KVM_SYSTEM_TIME), + TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), + TEST_MSR(MSR_KVM_WALL_CLOCK), + TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), + TEST_MSR(MSR_KVM_ASYNC_PF_EN), + TEST_MSR(MSR_KVM_STEAL_TIME), + TEST_MSR(MSR_KVM_PV_EOI_EN), + TEST_MSR(MSR_KVM_POLL_CONTROL), + TEST_MSR(MSR_KVM_ASYNC_PF_INT), + TEST_MSR(MSR_KVM_ASYNC_PF_ACK), +}; + +static void test_msr(struct msr_data *msr) +{ + uint64_t ignored; + uint8_t vector; + + PR_MSR(msr); + + vector = rdmsr_safe(msr->idx, &ignored); + GUEST_ASSERT_EQ(vector, GP_VECTOR); + + vector = wrmsr_safe(msr->idx, 0); + GUEST_ASSERT_EQ(vector, GP_VECTOR); +} + +struct hcall_data { + uint64_t nr; + const char *name; +}; + +#define TEST_HCALL(hc) { .nr = hc, .name = #hc } +#define UCALL_PR_HCALL 0xdeadc0de +#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) + +/* + * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding + * features have been cleared in KVM_CPUID_FEATURES. 
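+ * (This is only enforced because main() enables KVM_CAP_ENFORCE_PV_FEATURE_CPUID before clearing the leaf.)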
+ */ +static struct hcall_data hcalls_to_test[] = { + TEST_HCALL(KVM_HC_KICK_CPU), + TEST_HCALL(KVM_HC_SEND_IPI), + TEST_HCALL(KVM_HC_SCHED_YIELD), +}; + +static void test_hcall(struct hcall_data *hc) +{ + uint64_t r; + + PR_HCALL(hc); + r = kvm_hypercall(hc->nr, 0, 0, 0, 0); + GUEST_ASSERT_EQ(r, -KVM_ENOSYS); +} + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { + test_msr(&msrs_to_test[i]); + } + + for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { + test_hcall(&hcalls_to_test[i]); + } + + GUEST_DONE(); +} + +static void pr_msr(struct ucall *uc) +{ + struct msr_data *msr = (struct msr_data *)uc->args[0]; + + pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); +} + +static void pr_hcall(struct ucall *uc) +{ + struct hcall_data *hc = (struct hcall_data *)uc->args[0]; + + pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_PR_MSR: + pr_msr(&uc); + break; + case UCALL_PR_HCALL: + pr_hcall(&uc); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + return; + } + } +} + +static void test_pv_unhalt(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_cpuid_entry2 *ent; + u32 kvm_sig_old; + + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + + TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + + /* KVM_PV_UNHALT test */ + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + + TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); + + /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + kvm_sig_old = ent->ebx; + ent->ebx = 0xdeadbeef; + vcpu_set_cpuid(vcpu); + + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + ent->ebx = kvm_sig_old; + vcpu_set_cpuid(vcpu); + + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + + /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ + + kvm_vm_free(vm); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); + + vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); + + enter_guest(vcpu); + kvm_vm_free(vm); + + test_pv_unhalt(); +} diff --git a/tools/testing/selftests/kvm/x86/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86/max_vcpuid_cap_test.c new file mode 100644 index 000000000000..7e2bfb3c3f3b --- /dev/null +++ b/tools/testing/selftests/kvm/x86/max_vcpuid_cap_test.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc. 
+ * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, ret + 1); + TEST_ASSERT(ret < 0, + "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail"); + + /* Test BOOT_CPU_ID interaction (MAX_VCPU_ID cannot be lower) */ + if (kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) { + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)MAX_VCPU_ID); + + /* Try setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID */ + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID - 1); + TEST_ASSERT(ret < 0, + "Setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID should fail"); + } + + /* Set KVM_CAP_MAX_VCPU_ID */ + vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1); + TEST_ASSERT(ret < 0, + "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */ + ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID); + TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail"); + + /* Create vCPU with bits 63:32 != 0, but an otherwise valid id */ + ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(1L << 32)); + TEST_ASSERT(ret < 0, "Creating vCPU with ID[63:32] != 0 should fail"); + + /* Create vCPU with id within bounds */ + ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)0); + TEST_ASSERT(ret >= 0, "Creating vCPU with ID 0 should succeed"); + + close(ret); + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c new file mode 100644 index 000000000000..2b550eff35f1 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" + +#define CPUID_MWAIT (1u << 3) + +enum monitor_mwait_testcases { + MWAIT_QUIRK_DISABLED = BIT(0), + MISC_ENABLES_QUIRK_DISABLED = BIT(1), + MWAIT_DISABLED = BIT(2), +}; + +/* + * If both MWAIT and its quirk are disabled, MONITOR/MWAIT should #UD, in all + * other scenarios KVM should emulate them as nops. + */ +#define GUEST_ASSERT_MONITOR_MWAIT(insn, testcase, vector) \ +do { \ + bool fault_wanted = ((testcase) & MWAIT_QUIRK_DISABLED) && \ + ((testcase) & MWAIT_DISABLED); \ + \ + if (fault_wanted) \ + __GUEST_ASSERT((vector) == UD_VECTOR, \ + "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", \ + testcase, vector); \ + else \ + __GUEST_ASSERT(!(vector), \ + "Expected success on " insn " for testcase '0x%x', got '0x%x'", \ + testcase, vector); \ +} while (0) + +static void guest_monitor_wait(int testcase) +{ + u8 vector; + + GUEST_SYNC(testcase); + + /* + * Arbitrarily MONITOR this function, SVM performs fault checks before + * intercept checks, so the inputs for MONITOR and MWAIT must be valid. 
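+ * (MONITOR takes the address in RAX and extensions/hints in RCX/RDX; MWAIT takes hints in EAX and extensions in ECX, hence the "a"/"c"/"d" operands handed to kvm_asm_safe().)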
+ */ + vector = kvm_asm_safe("monitor", "a"(guest_monitor_wait), "c"(0), "d"(0)); + GUEST_ASSERT_MONITOR_MWAIT("MONITOR", testcase, vector); + + vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0)); + GUEST_ASSERT_MONITOR_MWAIT("MWAIT", testcase, vector); +} + +static void guest_code(void) +{ + guest_monitor_wait(MWAIT_DISABLED); + + guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + uint64_t disabled_quirks; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int testcase; + + TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); + + while (1) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + testcase = uc.args[1]; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto done; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + goto done; + } + + disabled_quirks = 0; + if (testcase & MWAIT_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; + if (testcase & MISC_ENABLES_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); + + /* + * If the MISC_ENABLES quirk (KVM neglects to update CPUID to + * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT + * bit in MISC_ENABLES accordingly. If the quirk is enabled, + * the only valid configuration is MWAIT disabled, as CPUID + * can't be manually changed after running the vCPU. + */ + if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) { + TEST_ASSERT(testcase & MWAIT_DISABLED, + "Can't toggle CPUID features after running vCPU"); + continue; + } + + vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, + (testcase & MWAIT_DISABLED) ? 0 : MSR_IA32_MISC_ENABLE_MWAIT); + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c new file mode 100644 index 000000000000..3eb0313ffa39 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* + * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with + * the "real" exceptions used, #SS/#GP/#DF (12/13/8). + */ +#define FAKE_TRIPLE_FAULT_VECTOR 0xaa + +/* Arbitrary 32-bit error code injected by this test. */ +#define SS_ERROR_CODE 0xdeadbeef + +/* + * Bit '0' is set on Intel if the exception occurs while delivering a previous + * event/exception. AMD's wording is ambiguous, but presumably the bit is set + * if the exception occurs while delivering an external event, e.g. NMI or INTR, + * but not for exceptions that occur when delivering other exceptions or + * software interrupts. 
+ * + * Note, Intel's name for it, "External event", is misleading and much more + * aligned with AMD's behavior, but the SDM is quite clear on its behavior. + */ +#define ERROR_CODE_EXT_FLAG BIT(0) + +/* + * Bit '1' is set if the fault occurred when looking up a descriptor in the + * IDT, which is the case here as the IDT is empty/NULL. + */ +#define ERROR_CODE_IDT_FLAG BIT(1) + +/* + * The #GP that occurs when vectoring #SS should show the index into the IDT + * for #SS, plus have the "IDT flag" set. + */ +#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG) +#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG) + +/* + * Intel and AMD both shove '0' into the error code on #DF, regardless of what + * led to the double fault. + */ +#define DF_ERROR_CODE 0 + +#define INTERCEPT_SS (BIT_ULL(SS_VECTOR)) +#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR)) +#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR)) + +static void l2_ss_pending_test(void) +{ + GUEST_SYNC(SS_VECTOR); +} + +static void l2_ss_injected_gp_test(void) +{ + GUEST_SYNC(GP_VECTOR); +} + +static void l2_ss_injected_df_test(void) +{ + GUEST_SYNC(DF_VECTOR); +} + +static void l2_ss_injected_tf_test(void) +{ + GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR); +} + +static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, + uint32_t error_code) +{ + struct vmcb *vmcb = svm->vmcb; + struct vmcb_control_area *ctrl = &vmcb->control; + + vmcb->save.rip = (u64)l2_code; + run_guest(vmcb, svm->vmcb_gpa); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); + GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + struct vmcb_control_area *ctrl = &svm->vmcb->control; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + svm->vmcb->save.idtr.limit = 0; + ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); + + ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF; + svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE); + svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD); + + ctrl->intercept_exceptions = INTERCEPT_SS_DF; + svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + ctrl->intercept_exceptions = INTERCEPT_SS; + svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN); + + GUEST_DONE(); +} + +static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) +{ + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code)); + + GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); + + /* + * VMX disallows injecting an exception with error_code[31:16] != 0, + * and hardware will never generate a VM-Exit with bits 31:16 set. + * KVM should likewise truncate the "bad" userspace value. 
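+ * That is why the pending #SS below is checked against (u16)SS_ERROR_CODE.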
+ */ + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0); + vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE); + vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0); + vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0); + vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT); + + GUEST_DONE(); +} + +static void __attribute__((__flatten__)) l1_guest_code(void *test_data) +{ + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else + l1_vmx_code(test_data); +} + +static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector) +{ + struct ucall uc; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(vector == uc.args[1], + "Expected L2 to ask for %d, got %ld", vector, uc.args[1]); + break; + case UCALL_DONE: + TEST_ASSERT(vector == -1, + "Expected L2 to ask for %d, L2 says it's done", vector); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd); + } +} + +static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + TEST_ASSERT(!events.exception.pending, + "Vector %d unexpectedlt pending", events.exception.nr); + TEST_ASSERT(!events.exception.injected, + "Vector %d unexpectedly injected", events.exception.nr); + + events.flags = KVM_VCPUEVENT_VALID_PAYLOAD; + events.exception.pending = !inject; + events.exception.injected = inject; + events.exception.nr = SS_VECTOR; + events.exception.has_error_code = true; + events.exception.error_code = SS_ERROR_CODE; + vcpu_events_set(vcpu, &events); +} + +/* + * Verify KVM_{G,S}ET_EVENTS play nice with pending vs. injected exceptions + * when an exception is being queued for L2. Specifically, verify that KVM + * honors L1 exception intercept controls when a #SS is pending/injected, + * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted + * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1. + */ +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu_events events; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + /* Run L1 => L2. L2 should sync and request #SS. */ + vcpu_run(vcpu); + assert_ucall_vector(vcpu, SS_VECTOR); + + /* Pend #SS and request immediate exit. #SS should still be pending. */ + queue_ss_exception(vcpu, false); + vcpu->run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + /* Verify the pending events comes back out the same as it went in. 
*/ + vcpu_events_get(vcpu, &events); + TEST_ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD, + KVM_VCPUEVENT_VALID_PAYLOAD); + TEST_ASSERT_EQ(events.exception.pending, true); + TEST_ASSERT_EQ(events.exception.nr, SS_VECTOR); + TEST_ASSERT_EQ(events.exception.has_error_code, true); + TEST_ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE); + + /* + * Run for real with the pending #SS, L1 should get a VM-Exit due to + * #SS interception and re-enter L2 to request #GP (via injected #SS). + */ + vcpu->run->immediate_exit = false; + vcpu_run(vcpu); + assert_ucall_vector(vcpu, GP_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 should intercept before KVM morphs it to #DF. L1 should then + * disable #GP interception and run L2 to request #DF (via #SS => #GP). + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, DF_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 is no longer interception, and so should see a #DF VM-Exit. L1 + * should then signal that is done. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR); + + /* + * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so + * should see nested TRIPLE_FAULT / SHUTDOWN. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, -1); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c new file mode 100644 index 000000000000..e7efb2b35f8b --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Usage: to be run via nx_huge_page_test.sh, which does the necessary + * environment setup and teardown + * + * Copyright (C) 2022, Google LLC. + */ +#include +#include +#include + +#include +#include "kvm_util.h" +#include "processor.h" + +#define HPAGE_SLOT 10 +#define HPAGE_GPA (4UL << 30) /* 4G prevents collision w/ slot 0 */ +#define HPAGE_GVA HPAGE_GPA /* GVA is arbitrary, so use GPA. */ +#define PAGES_PER_2MB_HUGE_PAGE 512 +#define HPAGE_SLOT_NPAGES (3 * PAGES_PER_2MB_HUGE_PAGE) + +/* + * Passed by nx_huge_pages_test.sh to provide an easy warning if this test is + * being run without it. + */ +#define MAGIC_TOKEN 887563923 + +/* + * x86 opcode for the return instruction. Used to call into, and then + * immediately return from, memory backed with hugepages. + */ +#define RETURN_OPCODE 0xC3 + +/* Call the specified memory address. */ +static void guest_do_CALL(uint64_t target) +{ + ((void (*)(void)) target)(); +} + +/* + * Exit the VM after each memory access so that the userspace component of the + * test can make assertions about the pages backing the VM. + * + * See the below for an explanation of how each access should affect the + * backing mappings. 
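+ *
+ * Roughly, with NX huge pages enabled the host side expects:
+ *   read hpage_1  -> 1 huge page,  0 splits
+ *   read hpage_2  -> 2 huge pages, 0 splits
+ *   exec hpage_1  -> 1 huge page,  1 split
+ *   exec hpage_3  -> 1 huge page,  2 splits
+ * and, after the reclaim period, the splits are zapped so re-reading
+ * hpage_3 can be mapped huge again (2 huge pages, 0 splits).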
+ */ +void guest_code(void) +{ + uint64_t hpage_1 = HPAGE_GVA; + uint64_t hpage_2 = hpage_1 + (PAGE_SIZE * 512); + uint64_t hpage_3 = hpage_2 + (PAGE_SIZE * 512); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(1); + + READ_ONCE(*(uint64_t *)hpage_2); + GUEST_SYNC(2); + + guest_do_CALL(hpage_1); + GUEST_SYNC(3); + + guest_do_CALL(hpage_3); + GUEST_SYNC(4); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(5); + + READ_ONCE(*(uint64_t *)hpage_3); + GUEST_SYNC(6); +} + +static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m) +{ + int actual_pages_2m; + + actual_pages_2m = vm_get_stat(vm, "pages_2m"); + + TEST_ASSERT(actual_pages_2m == expected_pages_2m, + "Unexpected 2m page count. Expected %d, got %d", + expected_pages_2m, actual_pages_2m); +} + +static void check_split_count(struct kvm_vm *vm, int expected_splits) +{ + int actual_splits; + + actual_splits = vm_get_stat(vm, "nx_lpage_splits"); + + TEST_ASSERT(actual_splits == expected_splits, + "Unexpected NX huge page split count. Expected %d, got %d", + expected_splits, actual_splits); +} + +static void wait_for_reclaim(int reclaim_period_ms) +{ + long reclaim_wait_ms; + struct timespec ts; + + reclaim_wait_ms = reclaim_period_ms * 5; + ts.tv_sec = reclaim_wait_ms / 1000; + ts.tv_nsec = (reclaim_wait_ms - (ts.tv_sec * 1000)) * 1000000; + nanosleep(&ts, NULL); +} + +void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, + bool reboot_permissions) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t nr_bytes; + void *hva; + int r; + + vm = vm_create(1); + + if (disable_nx_huge_pages) { + r = __vm_disable_nx_huge_pages(vm); + if (reboot_permissions) { + TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); + } else { + TEST_ASSERT(r == -1 && errno == EPERM, + "This process should not have permission to disable NX huge pages"); + return; + } + } + + vcpu = vm_vcpu_add(vm, 0, guest_code); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB, + HPAGE_GPA, HPAGE_SLOT, + HPAGE_SLOT_NPAGES, 0); + + nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; + + /* + * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the + * region into the guest with 2MiB pages whenever TDP is disabled (i.e. + * whenever KVM is shadowing the guest page tables). + * + * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge + * pages irrespective of the guest page size, so map with 4KiB pages + * to test that that is the case. + */ + if (kvm_is_tdp_enabled()) + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); + else + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); + + hva = addr_gpa2hva(vm, HPAGE_GPA); + memset(hva, RETURN_OPCODE, nr_bytes); + + check_2m_page_count(vm, 0); + check_split_count(vm, 0); + + /* + * The guest code will first read from the first hugepage, resulting + * in a huge page mapping being created. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 0); + + /* + * Then the guest code will read from the second hugepage, resulting + * in another huge page mapping being created. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 2); + check_split_count(vm, 0); + + /* + * Next, the guest will execute from the first huge page, causing it + * to be remapped at 4k. + * + * If NX huge pages are disabled, this should have no effect. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 2 : 1); + check_split_count(vm, disable_nx_huge_pages ? 
0 : 1); + + /* + * Executing from the third huge page (previously unaccessed) will + * cause part to be mapped at 4k. + * + * If NX huge pages are disabled, it should be mapped at 2M. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); + + /* Reading from the first huge page again should have no effect. */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); + + /* Give recovery thread time to run. */ + wait_for_reclaim(reclaim_period_ms); + + /* + * Now that the reclaimer has run, all the split pages should be gone. + * + * If NX huge pages are disabled, the relaimer will not run, so + * nothing should change from here on. + */ + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, 0); + + /* + * The 4k mapping on hpage 3 should have been removed, so check that + * reading from it causes a huge page mapping to be installed. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 2); + check_split_count(vm, 0); + + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); + puts(""); + printf(" -p: The NX reclaim period in milliseconds.\n"); + printf(" -t: The magic token to indicate environment setup is done.\n"); + printf(" -r: The test has reboot permissions and can disable NX huge pages.\n"); + puts(""); + exit(0); +} + +int main(int argc, char **argv) +{ + int reclaim_period_ms = 0, token = 0, opt; + bool reboot_permissions = false; + + while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { + switch (opt) { + case 'p': + reclaim_period_ms = atoi_positive("Reclaim period", optarg); + break; + case 't': + token = atoi_paranoid(optarg); + break; + case 'r': + reboot_permissions = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); + + __TEST_REQUIRE(token == MAGIC_TOKEN, + "This test must be run with the magic token via '-t %d'.\n" + "Running via nx_huge_pages_test.sh, which also handles " + "environment setup, is strongly recommended.", MAGIC_TOKEN); + + run_test(reclaim_period_ms, false, reboot_permissions); + run_test(reclaim_period_ms, true, reboot_permissions); + + return 0; +} + diff --git a/tools/testing/selftests/kvm/x86/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.sh new file mode 100755 index 000000000000..caad084b8bfd --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only */ +# +# Wrapper script which performs setup and cleanup for nx_huge_pages_test. +# Makes use of root privileges to set up huge pages and KVM module parameters. +# +# Copyright (C) 2022, Google LLC. + +set -e + +NX_HUGE_PAGES=$(cat /sys/module/kvm/parameters/nx_huge_pages) +NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio) +NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) +HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + +# If we're already root, the host might not have sudo. 
+if [ $(whoami) == "root" ]; then + function do_sudo () { + "$@" + } +else + function do_sudo () { + sudo "$@" + } +fi + +set +e + +function sudo_echo () { + echo "$1" | do_sudo tee -a "$2" > /dev/null +} + +NXECUTABLE="$(dirname $0)/nx_huge_pages_test" + +sudo_echo test /dev/null || exit 4 # KSFT_SKIP=4 + +( + set -e + + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio + sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms + sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + + # Test with reboot permissions + if [ $(whoami) == "root" ] || sudo setcap cap_sys_boot+ep $NXECUTABLE 2> /dev/null; then + echo Running test with CAP_SYS_BOOT enabled + $NXECUTABLE -t 887563923 -p 100 -r + test $(whoami) == "root" || sudo setcap cap_sys_boot-ep $NXECUTABLE + else + echo setcap failed, skipping nx_huge_pages_test with CAP_SYS_BOOT enabled + fi + + # Test without reboot permissions + if [ $(whoami) != "root" ] ; then + echo Running test with CAP_SYS_BOOT disabled + $NXECUTABLE -t 887563923 -p 100 + else + echo Running as root, skipping nx_huge_pages_test with CAP_SYS_BOOT disabled + fi +) +RET=$? + +sudo_echo "$NX_HUGE_PAGES" /sys/module/kvm/parameters/nx_huge_pages +sudo_echo "$NX_HUGE_PAGES_RECOVERY_RATIO" /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio +sudo_echo "$NX_HUGE_PAGES_RECOVERY_PERIOD" /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms +sudo_echo "$HUGE_PAGES" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +exit $RET diff --git a/tools/testing/selftests/kvm/x86/platform_info_test.c b/tools/testing/selftests/kvm/x86/platform_info_test.c new file mode 100644 index 000000000000..9cbf283ebc55 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/platform_info_test.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies expected behavior of controlling guest access to + * MSR_PLATFORM_INFO. 
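+ *
+ * Roughly: while the capability is enabled, the guest's RDMSR should succeed
+ * and observe the max turbo ratio written by the host; once the capability
+ * is disabled, the same RDMSR is expected to #GP.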
+ */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 + +static void guest_code(void) +{ + uint64_t msr_platform_info; + uint8_t vector; + + GUEST_SYNC(true); + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + + GUEST_SYNC(false); + vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); + GUEST_ASSERT_EQ(vector, GP_VECTOR); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t msr_platform_info; + struct ucall uc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); + vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } + } + +done: + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c new file mode 100644 index 000000000000..698cb36989db --- /dev/null +++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c @@ -0,0 +1,644 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023, Tencent, Inc. + */ +#include + +#include "pmu.h" +#include "processor.h" + +/* Number of iterations of the loop for the guest measurement payload. */ +#define NUM_LOOPS 10 + +/* Each iteration of the loop retires one branch instruction. */ +#define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS) + +/* + * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, + * 1 LOOP. + */ +#define NUM_INSNS_PER_LOOP 3 + +/* + * Number of "extra" instructions that will be counted, i.e. the number of + * instructions that are needed to set up the loop and then disable the + * counter. 2 MOV, 2 XOR, 1 WRMSR. + */ +#define NUM_EXTRA_INSNS 5 + +/* Total number of instructions retired within the measured section. */ +#define NUM_INSNS_RETIRED (NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS) + + +static uint8_t kvm_pmu_version; +static bool kvm_has_perf_caps; + +static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code, + uint8_t pmu_version, + uint64_t perf_capabilities) +{ + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + sync_global_to_guest(vm, kvm_pmu_version); + + /* + * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling + * features via PERF_CAPABILITIES if the guest doesn't have a vPMU. 
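+ * E.g. setting pmu_version = 0 first and then writing PMU_CAP_FW_WRITES
+ * would presumably be rejected, which is why PERF_CAPABILITIES is written
+ * while the default CPUID still advertises a vPMU.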
+ */ + if (kvm_has_perf_caps) + vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version); + return vm; +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + pr_info("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static uint8_t guest_get_pmu_version(void) +{ + /* + * Return the effective PMU version, i.e. the minimum between what KVM + * supports and what is enumerated to the guest. The host deliberately + * advertises a PMU version to the guest beyond what is actually + * supported by KVM to verify KVM doesn't freak out and do something + * bizarre with an architecturally valid, but unsupported, version. + */ + return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION)); +} + +/* + * If an architectural event is supported and guaranteed to generate at least + * one "hit, assert that its count is non-zero. If an event isn't supported or + * the test can't guarantee the associated action will occur, then all bets are + * off regarding the count, i.e. no checks can be done. + * + * Sanity check that in all cases, the event doesn't count when it's disabled, + * and that KVM correctly emulates the write of an arbitrary value. + */ +static void guest_assert_event_count(uint8_t idx, + struct kvm_x86_pmu_feature event, + uint32_t pmc, uint32_t pmc_msr) +{ + uint64_t count; + + count = _rdpmc(pmc); + if (!this_pmu_has(event)) + goto sanity_checks; + + switch (idx) { + case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX: + GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED); + break; + case INTEL_ARCH_BRANCHES_RETIRED_INDEX: + GUEST_ASSERT_EQ(count, NUM_BRANCH_INSNS_RETIRED); + break; + case INTEL_ARCH_LLC_REFERENCES_INDEX: + case INTEL_ARCH_LLC_MISSES_INDEX: + if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) && + !this_cpu_has(X86_FEATURE_CLFLUSH)) + break; + fallthrough; + case INTEL_ARCH_CPU_CYCLES_INDEX: + case INTEL_ARCH_REFERENCE_CYCLES_INDEX: + GUEST_ASSERT_NE(count, 0); + break; + case INTEL_ARCH_TOPDOWN_SLOTS_INDEX: + GUEST_ASSERT(count >= NUM_INSNS_RETIRED); + break; + default: + break; + } + +sanity_checks: + __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS})); + GUEST_ASSERT_EQ(_rdpmc(pmc), count); + + wrmsr(pmc_msr, 0xdead); + GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead); +} + +/* + * Enable and disable the PMC in a monolithic asm blob to ensure that the + * compiler can't insert _any_ code into the measured sequence. Note, ECX + * doesn't need to be clobbered as the input value, @pmc_msr, is restored + * before the end of the sequence. + * + * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the + * CLFUSH{,OPT} instruction on each loop iteration to force LLC references and + * misses, i.e. to allow testing that those events actually count. + * + * If forced emulation is enabled (and specified), force emulation on a subset + * of the measured code to verify that KVM correctly emulates instructions and + * branches retired events in conjunction with hardware also counting said + * events. 
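+ *
+ * With the values defined above, each measured run is expected to retire
+ * NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS = 10 * 3 + 5 = 35
+ * instructions (NUM_INSNS_RETIRED) and NUM_BRANCH_INSNS_RETIRED = 10
+ * branches.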
+ */ +#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \ +do { \ + __asm__ __volatile__("wrmsr\n\t" \ + " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \ + "1:\n\t" \ + clflush "\n\t" \ + "mfence\n\t" \ + FEP "loop 1b\n\t" \ + FEP "mov %%edi, %%ecx\n\t" \ + FEP "xor %%eax, %%eax\n\t" \ + FEP "xor %%edx, %%edx\n\t" \ + "wrmsr\n\t" \ + :: "a"((uint32_t)_value), "d"(_value >> 32), \ + "c"(_msr), "D"(_msr) \ + ); \ +} while (0) + +#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \ +do { \ + wrmsr(pmc_msr, 0); \ + \ + if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP); \ + else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP); \ + else \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \ + \ + guest_assert_event_count(_idx, _event, _pmc, _pmc_msr); \ +} while (0) + +static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event, + uint32_t pmc, uint32_t pmc_msr, + uint32_t ctrl_msr, uint64_t ctrl_msr_value) +{ + GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, ""); + + if (is_forced_emulation_enabled) + GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP); +} + +#define X86_PMU_FEATURE_NULL \ +({ \ + struct kvm_x86_pmu_feature feature = {}; \ + \ + feature; \ +}) + +static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event) +{ + return !(*(u64 *)&event); +} + +static void guest_test_arch_event(uint8_t idx) +{ + const struct { + struct kvm_x86_pmu_feature gp_event; + struct kvm_x86_pmu_feature fixed_event; + } intel_event_to_feature[] = { + [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED }, + [INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED }, + /* + * Note, the fixed counter for reference cycles is NOT the same + * as the general purpose architectural event. The fixed counter + * explicitly counts at the same frequency as the TSC, whereas + * the GP event counts at a fixed, but uarch specific, frequency. + * Bundle them here for simplicity. + */ + [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED }, + [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED }, + }; + + uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + uint32_t pmu_version = guest_get_pmu_version(); + /* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */ + bool guest_has_perf_global_ctrl = pmu_version >= 2; + struct kvm_x86_pmu_feature gp_event, fixed_event; + uint32_t base_pmc_msr; + unsigned int i; + + /* The host side shouldn't invoke this without a guest PMU. 
*/ + GUEST_ASSERT(pmu_version); + + if (this_cpu_has(X86_FEATURE_PDCM) && + rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) + base_pmc_msr = MSR_IA32_PMC0; + else + base_pmc_msr = MSR_IA32_PERFCTR0; + + gp_event = intel_event_to_feature[idx].gp_event; + GUEST_ASSERT_EQ(idx, gp_event.f.bit); + + GUEST_ASSERT(nr_gp_counters); + + for (i = 0; i < nr_gp_counters; i++) { + uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS | + ARCH_PERFMON_EVENTSEL_ENABLE | + intel_pmu_arch_events[idx]; + + wrmsr(MSR_P6_EVNTSEL0 + i, 0); + if (guest_has_perf_global_ctrl) + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i)); + + __guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i, + MSR_P6_EVNTSEL0 + i, eventsel); + } + + if (!guest_has_perf_global_ctrl) + return; + + fixed_event = intel_event_to_feature[idx].fixed_event; + if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event)) + return; + + i = fixed_event.f.bit; + + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + + __guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED, + MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_CORE_PERF_GLOBAL_CTRL, + FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); +} + +static void guest_test_arch_events(void) +{ + uint8_t i; + + for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) + guest_test_arch_event(i); + + GUEST_DONE(); +} + +static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t length, uint8_t unavailable_mask) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Testing arch events requires a vPMU (there are no negative tests). */ + if (!pmu_version) + return; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH, + length); + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK, + unavailable_mask); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +/* + * Limit testing to MSRs that are actually defined by Intel (in the SDM). MSRs + * that aren't defined counter MSRs *probably* don't exist, but there's no + * guarantee that currently undefined MSR indices won't be used for something + * other than PMCs in the future. + */ +#define MAX_NR_GP_COUNTERS 8 +#define MAX_NR_FIXED_COUNTERS 3 + +#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector) \ +__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector, \ + "Expected %s on " #insn "(0x%x), got vector %u", \ + expect_gp ? 
"#GP" : "no fault", msr, vector) \ + +#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected) \ + __GUEST_ASSERT(val == expected_val, \ + "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx", \ + msr, expected_val, val); + +static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success, + uint64_t expected_val) +{ + uint8_t vector; + uint64_t val; + + vector = rdpmc_safe(rdpmc_idx, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); + if (expect_success) + GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); + + if (!is_forced_emulation_enabled) + return; + + vector = rdpmc_safe_fep(rdpmc_idx, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); + if (expect_success) + GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); +} + +static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters, + uint8_t nr_counters, uint32_t or_mask) +{ + const bool pmu_has_fast_mode = !guest_get_pmu_version(); + uint8_t i; + + for (i = 0; i < nr_possible_counters; i++) { + /* + * TODO: Test a value that validates full-width writes and the + * width of the counters. + */ + const uint64_t test_val = 0xffff; + const uint32_t msr = base_msr + i; + + /* + * Fixed counters are supported if the counter is less than the + * number of enumerated contiguous counters *or* the counter is + * explicitly enumerated in the supported counters mask. + */ + const bool expect_success = i < nr_counters || (or_mask & BIT(i)); + + /* + * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are + * unsupported, i.e. doesn't #GP and reads back '0'. + */ + const uint64_t expected_val = expect_success ? test_val : 0; + const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 && + msr != MSR_P6_PERFCTR1; + uint32_t rdpmc_idx; + uint8_t vector; + uint64_t val; + + vector = wrmsr_safe(msr, test_val); + GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); + + vector = rdmsr_safe(msr, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector); + + /* On #GP, the result of RDMSR is undefined. */ + if (!expect_gp) + GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val); + + /* + * Redo the read tests with RDPMC, which has different indexing + * semantics and additional capabilities. + */ + rdpmc_idx = i; + if (base_msr == MSR_CORE_PERF_FIXED_CTR0) + rdpmc_idx |= INTEL_RDPMC_FIXED; + + guest_test_rdpmc(rdpmc_idx, expect_success, expected_val); + + /* + * KVM doesn't support non-architectural PMUs, i.e. it should + * impossible to have fast mode RDPMC. Verify that attempting + * to use fast RDPMC always #GPs. + */ + GUEST_ASSERT(!expect_success || !pmu_has_fast_mode); + rdpmc_idx |= INTEL_RDPMC_FAST; + guest_test_rdpmc(rdpmc_idx, false, -1ull); + + vector = wrmsr_safe(msr, 0); + GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); + } +} + +static void guest_test_gp_counters(void) +{ + uint8_t pmu_version = guest_get_pmu_version(); + uint8_t nr_gp_counters = 0; + uint32_t base_msr; + + if (pmu_version) + nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. 
+ */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + + if (this_cpu_has(X86_FEATURE_PDCM) && + rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) + base_msr = MSR_IA32_PMC0; + else + base_msr = MSR_IA32_PERFCTR0; + + guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0); + GUEST_DONE(); +} + +static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t nr_gp_counters) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS, + nr_gp_counters); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +static void guest_test_fixed_counters(void) +{ + uint64_t supported_bitmask = 0; + uint8_t nr_fixed_counters = 0; + uint8_t i; + + /* Fixed counters require Architectural vPMU Version 2+. */ + if (guest_get_pmu_version() >= 2) + nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + + /* + * The supported bitmask for fixed counters was introduced in PMU + * version 5. + */ + if (guest_get_pmu_version() >= 5) + supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK); + + guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS, + nr_fixed_counters, supported_bitmask); + + for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) { + uint8_t vector; + uint64_t val; + + if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) { + vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL, + FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for counter %u in FIXED_CTR_CTRL", i); + + vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL, + FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i); + continue; + } + + wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0); + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); + __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS})); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i); + + GUEST_ASSERT_NE(val, 0); + } + GUEST_DONE(); +} + +static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t nr_fixed_counters, + uint32_t supported_bitmask) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK, + supported_bitmask); + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS, + nr_fixed_counters); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +static void test_intel_counters(void) +{ + uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); + unsigned int i; + uint8_t v, j; + uint32_t k; + + const uint64_t perf_caps[] = { + 0, + PMU_CAP_FW_WRITES, + }; + + /* + * Test up to PMU v5, which is the current maximum version defined by + * Intel, i.e. 
is the last version that is guaranteed to be backwards + * compatible with KVM's existing behavior. + */ + uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5); + + /* + * Detect the existence of events that aren't supported by selftests. + * This will (obviously) fail any time the kernel adds support for a + * new event, but it's worth paying that price to keep the test fresh. + */ + TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS, + "New architectural event(s) detected; please update this test (length = %u, mask = %x)", + nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK)); + + /* + * Force iterating over known arch events regardless of whether or not + * KVM/hardware supports a given event. + */ + nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS); + + for (v = 0; v <= max_pmu_version; v++) { + for (i = 0; i < ARRAY_SIZE(perf_caps); i++) { + if (!kvm_has_perf_caps && perf_caps[i]) + continue; + + pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + /* + * To keep the total runtime reasonable, test every + * possible non-zero, non-reserved bitmap combination + * only with the native PMU version and the full bit + * vector length. + */ + if (v == pmu_version) { + for (k = 1; k < (BIT(nr_arch_events) - 1); k++) + test_arch_events(v, perf_caps[i], nr_arch_events, k); + } + /* + * Test single bits for all PMU version and lengths up + * the number of events +1 (to verify KVM doesn't do + * weird things if the guest length is greater than the + * host length). Explicitly test a mask of '0' and all + * ones i.e. all events being available and unavailable. + */ + for (j = 0; j <= nr_arch_events + 1; j++) { + test_arch_events(v, perf_caps[i], j, 0); + test_arch_events(v, perf_caps[i], j, 0xff); + + for (k = 0; k < nr_arch_events; k++) + test_arch_events(v, perf_caps[i], j, BIT(k)); + } + + pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + for (j = 0; j <= nr_gp_counters; j++) + test_gp_counters(v, perf_caps[i], j); + + pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + for (j = 0; j <= nr_fixed_counters; j++) { + for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++) + test_fixed_counters(v, perf_caps[i], j, k); + } + } + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_is_pmu_enabled()); + + TEST_REQUIRE(host_cpu_is_intel); + TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); + + kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); + kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); + + test_intel_counters(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c new file mode 100644 index 000000000000..c15513cd74d1 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c @@ -0,0 +1,876 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_SET_PMU_EVENT_FILTER. + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies the expected behavior of allow lists and deny lists for + * virtual PMU events. 
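+ *
+ * For example, an ALLOW filter from which Instructions Retired and Branch
+ * Instructions Retired have been removed is expected to leave the guest's
+ * counters reading zero, whereas a DENY filter with the same events removed
+ * should leave them counting normally (see test_not_member_*_list() below).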
+ */ +#include "kvm_util.h" +#include "pmu.h" +#include "processor.h" +#include "test_util.h" + +#define NUM_BRANCHES 42 +#define MAX_TEST_EVENTS 10 + +#define PMU_EVENT_FILTER_INVALID_ACTION (KVM_PMU_EVENT_DENY + 1) +#define PMU_EVENT_FILTER_INVALID_FLAGS (KVM_PMU_EVENT_FLAGS_VALID_MASK << 1) +#define PMU_EVENT_FILTER_INVALID_NEVENTS (KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1) + +struct __kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; +}; + +/* + * This event list comprises Intel's known architectural events, plus AMD's + * Branch Instructions Retired for Zen CPUs. Note, AMD and Intel use the + * same encoding for Instructions Retired. + */ +kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED); + +static const struct __kvm_pmu_event_filter base_event_filter = { + .nevents = ARRAY_SIZE(base_event_filter.events), + .events = { + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + INTEL_ARCH_TOPDOWN_SLOTS, + AMD_ZEN_BRANCHES_RETIRED, + }, +}; + +struct { + uint64_t loads; + uint64_t stores; + uint64_t loads_stores; + uint64_t branches_retired; + uint64_t instructions_retired; +} pmc_results; + +/* + * If we encounter a #GP during the guest PMU sanity check, then the guest + * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). + */ +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(-EFAULT); +} + +/* + * Check that we can write a new value to the given MSR and read it back. + * The caller should provide a non-empty set of bits that are safe to flip. + * + * Return on success. GUEST_SYNC(0) on error. + */ +static void check_msr(uint32_t msr, uint64_t bits_to_flip) +{ + uint64_t v = rdmsr(msr) ^ bits_to_flip; + + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(-EIO); + + v ^= bits_to_flip; + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(-EIO); +} + +static void run_and_measure_loop(uint32_t msr_base) +{ + const uint64_t branches_retired = rdmsr(msr_base + 0); + const uint64_t insn_retired = rdmsr(msr_base + 1); + + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + + pmc_results.branches_retired = rdmsr(msr_base + 0) - branches_retired; + pmc_results.instructions_retired = rdmsr(msr_base + 1) - insn_retired; +} + +static void intel_guest_code(void) +{ + check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + check_msr(MSR_P6_EVNTSEL0, 0xffff); + check_msr(MSR_IA32_PMC0, 0xffff); + GUEST_SYNC(0); + + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED); + wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); + + run_and_measure_loop(MSR_IA32_PMC0); + GUEST_SYNC(0); + } +} + +/* + * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], + * this code uses the always-available, legacy K7 PMU MSRs, which alias to + * the first four of the six extended core PMU MSRs. 
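+ * (On such CPUs, MSR_K7_EVNTSEL0/MSR_K7_PERFCTR0 are presumably aliases of
+ * MSR_F15H_PERF_CTL0/MSR_F15H_PERF_CTR0, and likewise for counters 1-3.)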
+ */ +static void amd_guest_code(void) +{ + check_msr(MSR_K7_EVNTSEL0, 0xffff); + check_msr(MSR_K7_PERFCTR0, 0xffff); + GUEST_SYNC(0); + + for (;;) { + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED); + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED); + + run_and_measure_loop(MSR_K7_PERFCTR0); + GUEST_SYNC(0); + } +} + +/* + * Run the VM to the next GUEST_SYNC(value), and return the value passed + * to the sync. Any other exit from the guest is fatal. + */ +static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + get_ucall(vcpu, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); + return uc.args[1]; +} + +static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu) +{ + uint64_t r; + + memset(&pmc_results, 0, sizeof(pmc_results)); + sync_global_to_guest(vcpu->vm, pmc_results); + + r = run_vcpu_to_sync(vcpu); + TEST_ASSERT(!r, "Unexpected sync value: 0x%lx", r); + + sync_global_from_guest(vcpu->vm, pmc_results); +} + +/* + * In a nested environment or if the vPMU is disabled, the guest PMU + * might not work as architected (accessing the PMU MSRs may raise + * #GP, or writes could simply be discarded). In those situations, + * there is no point in running these tests. The guest code will perform + * a sanity check and then GUEST_SYNC(success). In the case of failure, + * the behavior of the guest on resumption is undefined. + */ +static bool sanity_check_pmu(struct kvm_vcpu *vcpu) +{ + uint64_t r; + + vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); + r = run_vcpu_to_sync(vcpu); + vm_install_exception_handler(vcpu->vm, GP_VECTOR, NULL); + + return !r; +} + +/* + * Remove the first occurrence of 'event' (if any) from the filter's + * event list. 
+ */ +static void remove_event(struct __kvm_pmu_event_filter *f, uint64_t event) +{ + bool found = false; + int i; + + for (i = 0; i < f->nevents; i++) { + if (found) + f->events[i - 1] = f->events[i]; + else + found = f->events[i] == event; + } + if (found) + f->nevents--; +} + +#define ASSERT_PMC_COUNTING_INSTRUCTIONS() \ +do { \ + uint64_t br = pmc_results.branches_retired; \ + uint64_t ir = pmc_results.instructions_retired; \ + \ + if (br && br != NUM_BRANCHES) \ + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", \ + __func__, br, NUM_BRANCHES); \ + TEST_ASSERT(br, "%s: Branch instructions retired = %lu (expected > 0)", \ + __func__, br); \ + TEST_ASSERT(ir, "%s: Instructions retired = %lu (expected > 0)", \ + __func__, ir); \ +} while (0) + +#define ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS() \ +do { \ + uint64_t br = pmc_results.branches_retired; \ + uint64_t ir = pmc_results.instructions_retired; \ + \ + TEST_ASSERT(!br, "%s: Branch instructions retired = %lu (expected 0)", \ + __func__, br); \ + TEST_ASSERT(!ir, "%s: Instructions retired = %lu (expected 0)", \ + __func__, ir); \ +} while (0) + +static void test_without_filter(struct kvm_vcpu *vcpu) +{ + run_vcpu_and_sync_pmc_results(vcpu); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_with_filter(struct kvm_vcpu *vcpu, + struct __kvm_pmu_event_filter *__f) +{ + struct kvm_pmu_event_filter *f = (void *)__f; + + vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); + run_vcpu_and_sync_pmc_results(vcpu); +} + +static void test_amd_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = { + .action = KVM_PMU_EVENT_DENY, + .nevents = 1, + .events = { + RAW_EVENT(0x1C2, 0), + }, + }; + + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_member_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_DENY; + test_with_filter(vcpu, &f); + + ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); +} + +static void test_member_allow_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_ALLOW; + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_not_member_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_DENY; + + remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); + remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); + remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_not_member_allow_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_ALLOW; + + remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); + remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); + remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); + test_with_filter(vcpu, &f); + + ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); +} + +/* + * Verify that setting KVM_PMU_CAP_DISABLE prevents the use of the PMU. + * + * Note that KVM_CAP_PMU_CAPABILITY must be invoked prior to creating VCPUs. 
+ */ +static void test_pmu_config_disable(void (*guest_code)(void)) +{ + struct kvm_vcpu *vcpu; + int r; + struct kvm_vm *vm; + + r = kvm_check_cap(KVM_CAP_PMU_CAPABILITY); + if (!(r & KVM_PMU_CAP_DISABLE)) + return; + + vm = vm_create(1); + + vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); + + vcpu = vm_vcpu_add(vm, 0, guest_code); + TEST_ASSERT(!sanity_check_pmu(vcpu), + "Guest should not be able to use disabled PMU."); + + kvm_vm_free(vm); +} + +/* + * On Intel, check for a non-zero PMU version, at least one general-purpose + * counter per logical processor, and support for counting the number of branch + * instructions retired. + */ +static bool use_intel_pmu(void) +{ + return host_cpu_is_intel && + kvm_cpu_property(X86_PROPERTY_PMU_VERSION) && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) && + kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED); +} + +/* + * On AMD, all Family 17h+ CPUs (Zen and its successors) use event encoding + * 0xc2,0 for Branch Instructions Retired. + */ +static bool use_amd_pmu(void) +{ + return host_cpu_is_amd && kvm_cpu_family() >= 0x17; +} + +/* + * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and + * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/ + * supported on Intel Xeon processors: + * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake. + */ +#define MEM_INST_RETIRED 0xD0 +#define MEM_INST_RETIRED_LOAD RAW_EVENT(MEM_INST_RETIRED, 0x81) +#define MEM_INST_RETIRED_STORE RAW_EVENT(MEM_INST_RETIRED, 0x82) +#define MEM_INST_RETIRED_LOAD_STORE RAW_EVENT(MEM_INST_RETIRED, 0x83) + +static bool supports_event_mem_inst_retired(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if (x86_family(eax) == 0x6) { + switch (x86_model(eax)) { + /* Sapphire Rapids */ + case 0x8F: + /* Ice Lake */ + case 0x6A: + /* Skylake */ + /* Cascade Lake */ + case 0x55: + return true; + } + } + + return false; +} + +/* + * "LS Dispatch", from Processor Programming Reference + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, + * Preliminary Processor Programming Reference (PPR) for AMD Family + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision + * B1 Processors Volume 1 of 2. + */ +#define LS_DISPATCH 0x29 +#define LS_DISPATCH_LOAD RAW_EVENT(LS_DISPATCH, BIT(0)) +#define LS_DISPATCH_STORE RAW_EVENT(LS_DISPATCH, BIT(1)) +#define LS_DISPATCH_LOAD_STORE RAW_EVENT(LS_DISPATCH, BIT(2)) + +#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false) +#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true) + +static void masked_events_guest_test(uint32_t msr_base) +{ + /* + * The actual value of the counters don't determine the outcome of + * the test. Only that they are zero or non-zero. 
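+ * The asm blob below performs one store ("movl $0"), one load
+ * ("movl %[v], %%eax") and one read-modify-write ("incl"), so every counter
+ * the filter permits should advance at least once.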
+ */ + const uint64_t loads = rdmsr(msr_base + 0); + const uint64_t stores = rdmsr(msr_base + 1); + const uint64_t loads_stores = rdmsr(msr_base + 2); + int val; + + + __asm__ __volatile__("movl $0, %[v];" + "movl %[v], %%eax;" + "incl %[v];" + : [v]"+m"(val) :: "eax"); + + pmc_results.loads = rdmsr(msr_base + 0) - loads; + pmc_results.stores = rdmsr(msr_base + 1) - stores; + pmc_results.loads_stores = rdmsr(msr_base + 2) - loads_stores; +} + +static void intel_masked_events_guest_code(void) +{ + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD); + wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE); + wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE); + + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7); + + masked_events_guest_test(MSR_IA32_PMC0); + GUEST_SYNC(0); + } +} + +static void amd_masked_events_guest_code(void) +{ + for (;;) { + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL1, 0); + wrmsr(MSR_K7_EVNTSEL2, 0); + + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD); + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE); + wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE); + + masked_events_guest_test(MSR_K7_PERFCTR0); + GUEST_SYNC(0); + } +} + +static void run_masked_events_test(struct kvm_vcpu *vcpu, + const uint64_t masked_events[], + const int nmasked_events) +{ + struct __kvm_pmu_event_filter f = { + .nevents = nmasked_events, + .action = KVM_PMU_EVENT_ALLOW, + .flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + }; + + memcpy(f.events, masked_events, sizeof(uint64_t) * nmasked_events); + test_with_filter(vcpu, &f); +} + +#define ALLOW_LOADS BIT(0) +#define ALLOW_STORES BIT(1) +#define ALLOW_LOADS_STORES BIT(2) + +struct masked_events_test { + uint64_t intel_events[MAX_TEST_EVENTS]; + uint64_t intel_event_end; + uint64_t amd_events[MAX_TEST_EVENTS]; + uint64_t amd_event_end; + const char *msg; + uint32_t flags; +}; + +/* + * These are the test cases for the masked events tests. + * + * For each test, the guest enables 3 PMU counters (loads, stores, + * loads + stores). The filter is then set in KVM with the masked events + * provided. The test then verifies that the counters agree with which + * ones should be counting and which ones should be filtered. 
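+ *
+ * As a concrete example of the encoding: a filter entry matches when,
+ * roughly, (guest unit mask & mask) == match. So
+ * INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81) allows only ALL_LOADS,
+ * while INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0) matches 0x81, 0x82
+ * and 0x83 at once and is then narrowed by EXCLUDE_MASKED_ENTRY() entries.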
+ */ +const struct masked_events_test test_cases[] = { + { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow loads.", + .flags = ALLOW_LOADS, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow stores.", + .flags = ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)), + }, + .msg = "Only allow loads + stores.", + .flags = ALLOW_LOADS_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0), + }, + .msg = "Only allow loads and stores.", + .flags = ALLOW_LOADS | ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow loads and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow stores and loads + stores.", + .flags = ALLOW_STORES | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + }, + .msg = "Only allow loads, stores, and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES + }, +}; + +static int append_test_events(const struct masked_events_test *test, + uint64_t *events, int nevents) +{ + const uint64_t *evts; + int i; + + evts = use_intel_pmu() ? test->intel_events : test->amd_events; + for (i = 0; i < MAX_TEST_EVENTS; i++) { + if (evts[i] == 0) + break; + + events[nevents + i] = evts[i]; + } + + return nevents + i; +} + +static bool bool_eq(bool a, bool b) +{ + return a == b; +} + +static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events, + int nevents) +{ + int ntests = ARRAY_SIZE(test_cases); + int i, n; + + for (i = 0; i < ntests; i++) { + const struct masked_events_test *test = &test_cases[i]; + + /* Do any test case events overflow MAX_TEST_EVENTS? 
*/ + assert(test->intel_event_end == 0); + assert(test->amd_event_end == 0); + + n = append_test_events(test, events, nevents); + + run_masked_events_test(vcpu, events, n); + + TEST_ASSERT(bool_eq(pmc_results.loads, test->flags & ALLOW_LOADS) && + bool_eq(pmc_results.stores, test->flags & ALLOW_STORES) && + bool_eq(pmc_results.loads_stores, + test->flags & ALLOW_LOADS_STORES), + "%s loads: %lu, stores: %lu, loads + stores: %lu", + test->msg, pmc_results.loads, pmc_results.stores, + pmc_results.loads_stores); + } +} + +static void add_dummy_events(uint64_t *events, int nevents) +{ + int i; + + for (i = 0; i < nevents; i++) { + int event_select = i % 0xFF; + bool exclude = ((i % 4) == 0); + + if (event_select == MEM_INST_RETIRED || + event_select == LS_DISPATCH) + event_select++; + + events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0, + 0, exclude); + } +} + +static void test_masked_events(struct kvm_vcpu *vcpu) +{ + int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS; + uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; + + /* Run the test cases against a sparse PMU event filter. */ + run_masked_events_tests(vcpu, events, 0); + + /* Run the test cases against a dense PMU event filter. */ + add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS); + run_masked_events_tests(vcpu, events, nevents); +} + +static int set_pmu_event_filter(struct kvm_vcpu *vcpu, + struct __kvm_pmu_event_filter *__f) +{ + struct kvm_pmu_event_filter *f = (void *)__f; + + return __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); +} + +static int set_pmu_single_event_filter(struct kvm_vcpu *vcpu, uint64_t event, + uint32_t flags, uint32_t action) +{ + struct __kvm_pmu_event_filter f = { + .nevents = 1, + .flags = flags, + .action = action, + .events = { + event, + }, + }; + + return set_pmu_event_filter(vcpu, &f); +} + +static void test_filter_ioctl(struct kvm_vcpu *vcpu) +{ + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + struct __kvm_pmu_event_filter f; + uint64_t e = ~0ul; + int r; + + /* + * Unfortunately having invalid bits set in event data is expected to + * pass when flags == 0 (bits other than eventsel+umask). 
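+ * E.g. the ~0ul event programmed below must be accepted with flags == 0,
+ * but the same value must be rejected once KVM_PMU_EVENT_FLAG_MASKED_EVENTS
+ * is set.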
+ */ + r = set_pmu_single_event_filter(vcpu, e, 0, KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); + + r = set_pmu_single_event_filter(vcpu, e, + KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail"); + + e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf); + r = set_pmu_single_event_filter(vcpu, e, + KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); + + f = base_event_filter; + f.action = PMU_EVENT_FILTER_INVALID_ACTION; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Set invalid action is expected to fail"); + + f = base_event_filter; + f.flags = PMU_EVENT_FILTER_INVALID_FLAGS; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Set invalid flags is expected to fail"); + + f = base_event_filter; + f.nevents = PMU_EVENT_FILTER_INVALID_NEVENTS; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Exceeding the max number of filter events should fail"); + + f = base_event_filter; + f.fixed_counter_bitmap = ~GENMASK_ULL(nr_fixed_counters, 0); + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed"); +} + +static void intel_run_fixed_counter_guest_code(uint8_t idx) +{ + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0); + + /* Only OS_EN bit is enabled for fixed counter[idx]. */ + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL)); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx)); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx)); + } +} + +static uint64_t test_with_fixed_counter_filter(struct kvm_vcpu *vcpu, + uint32_t action, uint32_t bitmap) +{ + struct __kvm_pmu_event_filter f = { + .action = action, + .fixed_counter_bitmap = bitmap, + }; + set_pmu_event_filter(vcpu, &f); + + return run_vcpu_to_sync(vcpu); +} + +static uint64_t test_set_gp_and_fixed_event_filter(struct kvm_vcpu *vcpu, + uint32_t action, + uint32_t bitmap) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = action; + f.fixed_counter_bitmap = bitmap; + set_pmu_event_filter(vcpu, &f); + + return run_vcpu_to_sync(vcpu); +} + +static void __test_fixed_counter_bitmap(struct kvm_vcpu *vcpu, uint8_t idx, + uint8_t nr_fixed_counters) +{ + unsigned int i; + uint32_t bitmap; + uint64_t count; + + TEST_ASSERT(nr_fixed_counters < sizeof(bitmap) * 8, + "Invalid nr_fixed_counters"); + + /* + * Check the fixed performance counter can count normally when KVM + * userspace doesn't set any pmu filter. + */ + count = run_vcpu_to_sync(vcpu); + TEST_ASSERT(count, "Unexpected count value: %ld", count); + + for (i = 0; i < BIT(nr_fixed_counters); i++) { + bitmap = BIT(i); + count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_ALLOW, + bitmap); + TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); + + count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_DENY, + bitmap); + TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); + + /* + * Check that fixed_counter_bitmap has higher priority than + * events[] when both are set. 
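+ * E.g. with an ALLOW action the fixed counter should count only when
+ * BIT(idx) is set in fixed_counter_bitmap, and with a DENY action it should
+ * stop counting only when BIT(idx) is set, regardless of events[].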
+ */ + count = test_set_gp_and_fixed_event_filter(vcpu, + KVM_PMU_EVENT_ALLOW, + bitmap); + TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); + + count = test_set_gp_and_fixed_event_filter(vcpu, + KVM_PMU_EVENT_DENY, + bitmap); + TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); + } +} + +static void test_fixed_counter_bitmap(void) +{ + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint8_t idx; + + /* + * Check that pmu_event_filter works as expected when it's applied to + * fixed performance counters. + */ + for (idx = 0; idx < nr_fixed_counters; idx++) { + vm = vm_create_with_one_vcpu(&vcpu, + intel_run_fixed_counter_guest_code); + vcpu_args_set(vcpu, 1, idx); + __test_fixed_counter_bitmap(vcpu, idx, nr_fixed_counters); + kvm_vm_free(vm); + } +} + +int main(int argc, char *argv[]) +{ + void (*guest_code)(void); + struct kvm_vcpu *vcpu, *vcpu2 = NULL; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_is_pmu_enabled()); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS)); + + TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); + guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + TEST_REQUIRE(sanity_check_pmu(vcpu)); + + if (use_amd_pmu()) + test_amd_deny_list(vcpu); + + test_without_filter(vcpu); + test_member_deny_list(vcpu); + test_member_allow_list(vcpu); + test_not_member_deny_list(vcpu); + test_not_member_allow_list(vcpu); + + if (use_intel_pmu() && + supports_event_mem_inst_retired() && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3) + vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code); + else if (use_amd_pmu()) + vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code); + + if (vcpu2) + test_masked_events(vcpu2); + test_filter_ioctl(vcpu); + + kvm_vm_free(vm); + + test_pmu_config_disable(guest_code); + test_fixed_counter_bitmap(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c new file mode 100644 index 000000000000..82a8d88b5338 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define BASE_DATA_SLOT 10 +#define BASE_DATA_GPA ((uint64_t)(1ull << 32)) +#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE)) + +/* Horrific macro so that the line info is captured accurately :-( */ +#define memcmp_g(gpa, pattern, size) \ +do { \ + uint8_t *mem = (uint8_t *)gpa; \ + size_t i; \ + \ + for (i = 0; i < size; i++) \ + __GUEST_ASSERT(mem[i] == pattern, \ + "Guest expected 0x%x at offset %lu (gpa 0x%lx), got 0x%x", \ + pattern, i, gpa + i, mem[i]); \ +} while (0) + +static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + TEST_ASSERT(mem[i] == pattern, + "Host expected 0x%x at gpa 0x%lx, got 0x%x", + pattern, gpa + i, mem[i]); +} + +/* + * Run memory conversion tests with explicit conversion: + * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit + * to back/unback private memory. Subsequent accesses by guest to the gpa range + * will not cause exit to userspace. 
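+ * (The conversion hypercall is KVM_HC_MAP_GPA_RANGE; the host side handles the
+ * resulting KVM_EXIT_HYPERCALL by toggling KVM_MEMORY_ATTRIBUTE_PRIVATE and/or
+ * punching holes in the guest_memfd, see handle_exit_hypercall() below.)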
+ * + * Test memory conversion scenarios with following steps: + * 1) Access private memory using private access and verify that memory contents + * are not visible to userspace. + * 2) Convert memory to shared using explicit conversions and ensure that + * userspace is able to access the shared regions. + * 3) Convert memory back to private using explicit conversions and ensure that + * userspace is again not able to access converted private regions. + */ + +#define GUEST_STAGE(o, s) { .offset = o, .size = s } + +enum ucall_syncs { + SYNC_SHARED, + SYNC_PRIVATE, +}; + +static void guest_sync_shared(uint64_t gpa, uint64_t size, + uint8_t current_pattern, uint8_t new_pattern) +{ + GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern); +} + +static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern) +{ + GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern); +} + +/* Arbitrary values, KVM doesn't care about the attribute flags. */ +#define MAP_GPA_SET_ATTRIBUTES BIT(0) +#define MAP_GPA_SHARED BIT(1) +#define MAP_GPA_DO_FALLOCATE BIT(2) + +static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared, + bool do_fallocate) +{ + uint64_t flags = MAP_GPA_SET_ATTRIBUTES; + + if (map_shared) + flags |= MAP_GPA_SHARED; + if (do_fallocate) + flags |= MAP_GPA_DO_FALLOCATE; + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, true, do_fallocate); +} + +static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, false, do_fallocate); +} + +struct { + uint64_t offset; + uint64_t size; +} static const test_ranges[] = { + GUEST_STAGE(0, PAGE_SIZE), + GUEST_STAGE(0, SZ_2M), + GUEST_STAGE(PAGE_SIZE, PAGE_SIZE), + GUEST_STAGE(PAGE_SIZE, SZ_2M), + GUEST_STAGE(SZ_2M, PAGE_SIZE), +}; + +static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate) +{ + const uint8_t def_p = 0xaa; + const uint8_t init_p = 0xcc; + uint64_t j; + int i; + + /* Memory should be shared by default. */ + memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE); + guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p); + + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + uint8_t p1 = 0x11; + uint8_t p2 = 0x22; + uint8_t p3 = 0x33; + uint8_t p4 = 0x44; + + /* + * Set the test region to pattern one to differentiate it from + * the data range as a whole (contains the initial pattern). + */ + memset((void *)gpa, p1, size); + + /* + * Convert to private, set and verify the private data, and + * then verify that the rest of the data (map shared) still + * holds the initial pattern, and that the host always sees the + * shared memory (initial pattern). Unlike shared memory, + * punching a hole in private memory is destructive, i.e. + * previous values aren't guaranteed to be preserved. + */ + guest_map_private(gpa, size, do_fallocate); + + if (size > PAGE_SIZE) { + memset((void *)gpa, p2, PAGE_SIZE); + goto skip; + } + + memset((void *)gpa, p2, size); + guest_sync_private(gpa, size, p1); + + /* + * Verify that the private memory was set to pattern two, and + * that shared memory still holds the initial pattern. 
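+		 * The two boundary memcmp_g() calls below check the untouched
+		 * regions before and after the test range within the per-vCPU
+		 * chunk.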
+ */ + memcmp_g(gpa, p2, size); + if (gpa > base_gpa) + memcmp_g(base_gpa, init_p, gpa - base_gpa); + if (gpa + size < base_gpa + PER_CPU_DATA_SIZE) + memcmp_g(gpa + size, init_p, + (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size)); + + /* + * Convert odd-number page frames back to shared to verify KVM + * also correctly handles holes in private ranges. + */ + for (j = 0; j < size; j += PAGE_SIZE) { + if ((j >> PAGE_SHIFT) & 1) { + guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate); + guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3); + + memcmp_g(gpa + j, p3, PAGE_SIZE); + } else { + guest_sync_private(gpa + j, PAGE_SIZE, p1); + } + } + +skip: + /* + * Convert the entire region back to shared, explicitly write + * pattern three to fill in the even-number frames before + * asking the host to verify (and write pattern four). + */ + guest_map_shared(gpa, size, do_fallocate); + memset((void *)gpa, p3, size); + guest_sync_shared(gpa, size, p3, p4); + memcmp_g(gpa, p4, size); + + /* Reset the shared memory back to the initial pattern. */ + memset((void *)gpa, init_p, size); + + /* + * Free (via PUNCH_HOLE) *all* private memory so that the next + * iteration starts from a clean slate, e.g. with respect to + * whether or not there are pages/folios in guest_mem. + */ + guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true); + } +} + +static void guest_punch_hole(uint64_t gpa, uint64_t size) +{ + /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */ + uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE; + + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +/* + * Test that PUNCH_HOLE actually frees memory by punching holes without doing a + * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating + * (subsequent fault) should zero memory. + */ +static void guest_test_punch_hole(uint64_t base_gpa, bool precise) +{ + const uint8_t init_p = 0xcc; + int i; + + /* + * Convert the entire range to private, this testcase is all about + * punching holes in guest_memfd, i.e. shared mappings aren't needed. + */ + guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + + /* + * Free all memory before each iteration, even for the !precise + * case where the memory will be faulted back in. Freeing and + * reallocating should obviously work, and freeing all memory + * minimizes the probability of cross-testcase influence. + */ + guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE); + + /* Fault-in and initialize memory, and verify the pattern. */ + if (precise) { + memset((void *)gpa, init_p, size); + memcmp_g(gpa, init_p, size); + } else { + memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + } + + /* + * Punch a hole at the target range and verify that reads from + * the guest succeed and return zeroes. + */ + guest_punch_hole(gpa, size); + memcmp_g(gpa, 0, size); + } +} + +static void guest_code(uint64_t base_gpa) +{ + /* + * Run the conversion test twice, with and without doing fallocate() on + * the guest_memfd backing when converting between shared and private. + */ + guest_test_explicit_conversion(base_gpa, false); + guest_test_explicit_conversion(base_gpa, true); + + /* + * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd + * faulted in, once with only the target range faulted in. 
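+	 * (precise == false faults in the entire per-vCPU chunk beforehand,
+	 * precise == true faults in only the target range.)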
+ */ + guest_test_punch_hole(base_gpa, false); + guest_test_punch_hole(base_gpa, true); + GUEST_DONE(); +} + +static void handle_exit_hypercall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + uint64_t gpa = run->hypercall.args[0]; + uint64_t size = run->hypercall.args[1] * PAGE_SIZE; + bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES; + bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED; + bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE; + struct kvm_vm *vm = vcpu->vm; + + TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE, + "Wanted MAP_GPA_RANGE (%u), got '%llu'", + KVM_HC_MAP_GPA_RANGE, run->hypercall.nr); + + if (do_fallocate) + vm_guest_mem_fallocate(vm, gpa, size, map_shared); + + if (set_attributes) + vm_set_memory_attributes(vm, gpa, size, + map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE); + run->hypercall.ret = 0; +} + +static bool run_vcpus; + +static void *__test_mem_conversions(void *__vcpu) +{ + struct kvm_vcpu *vcpu = __vcpu; + struct kvm_run *run = vcpu->run; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + + while (!READ_ONCE(run_vcpus)) + ; + + for ( ;; ) { + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_HYPERCALL) { + handle_exit_hypercall(vcpu); + continue; + } + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: { + uint64_t gpa = uc.args[1]; + size_t size = uc.args[2]; + size_t i; + + TEST_ASSERT(uc.args[0] == SYNC_SHARED || + uc.args[0] == SYNC_PRIVATE, + "Unknown sync command '%ld'", uc.args[0]); + + for (i = 0; i < size; i += vm->page_size) { + size_t nr_bytes = min_t(size_t, vm->page_size, size - i); + uint8_t *hva = addr_gpa2hva(vm, gpa + i); + + /* In all cases, the host should observe the shared data. */ + memcmp_h(hva, gpa + i, uc.args[3], nr_bytes); + + /* For shared, write the new pattern to guest memory. */ + if (uc.args[0] == SYNC_SHARED) + memset(hva, uc.args[4], nr_bytes); + } + break; + } + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +} + +static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, + uint32_t nr_memslots) +{ + /* + * Allocate enough memory so that each vCPU's chunk of memory can be + * naturally aligned with respect to the size of the backing store. 
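+	 * E.g. with a 4K or 2M backing page size the alignment is 2M, so
+	 * PER_CPU_DATA_SIZE (2M + 4K) is rounded up to a 4M chunk per vCPU.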
+ */ + const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type)); + const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment); + const size_t memfd_size = per_cpu_size * nr_vcpus; + const size_t slot_size = memfd_size / nr_memslots; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + pthread_t threads[KVM_MAX_VCPUS]; + struct kvm_vm *vm; + int memfd, i, r; + + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + TEST_ASSERT(slot_size * nr_memslots == memfd_size, + "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)", + memfd_size, nr_memslots); + vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus); + + vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); + + memfd = vm_create_guest_memfd(vm, memfd_size, 0); + + for (i = 0; i < nr_memslots; i++) + vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, + BASE_DATA_SLOT + i, slot_size / vm->page_size, + KVM_MEM_GUEST_MEMFD, memfd, slot_size * i); + + for (i = 0; i < nr_vcpus; i++) { + uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size; + + vcpu_args_set(vcpus[i], 1, gpa); + + /* + * Map only what is needed so that an out-of-bounds access + * results #PF => SHUTDOWN instead of data corruption. + */ + virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size); + + pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]); + } + + WRITE_ONCE(run_vcpus, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + kvm_vm_free(vm); + + /* + * Allocate and free memory from the guest_memfd after closing the VM + * fd. The guest_memfd is gifted a reference to its owning VM, i.e. + * should prevent the VM from being fully destroyed until the last + * reference to the guest_memfd is also put. + */ + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + close(memfd); +} + +static void usage(const char *cmd) +{ + puts(""); + printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd); + puts(""); + backing_src_help("-s"); + puts(""); + puts(" -n: specify the number of vcpus (default: 1)"); + puts(""); + puts(" -m: specify the number of memslots (default: 1)"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC; + uint32_t nr_memslots = 1; + uint32_t nr_vcpus = 1; + int opt; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) { + switch (opt) { + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'n': + nr_vcpus = atoi_positive("nr_vcpus", optarg); + break; + case 'm': + nr_memslots = atoi_positive("nr_memslots", optarg); + break; + case 'h': + default: + usage(argv[0]); + exit(0); + } + } + + test_mem_conversions(src_type, nr_vcpus, nr_memslots); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c new file mode 100644 index 000000000000..13e72fcec8dd --- /dev/null +++ b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Google LLC. 
+ */ +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +/* Arbitrarily selected to avoid overlaps with anything else */ +#define EXITS_TEST_GVA 0xc0000000 +#define EXITS_TEST_GPA EXITS_TEST_GVA +#define EXITS_TEST_NPAGES 1 +#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE) +#define EXITS_TEST_SLOT 10 + +static uint64_t guest_repeatedly_read(void) +{ + volatile uint64_t value; + + while (true) + value = *((uint64_t *) EXITS_TEST_GVA); + + return value; +} + +static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu) +{ + int r; + + r = _vcpu_run(vcpu); + if (r) { + TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r)); + TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT); + } + return vcpu->run->exit_reason; +} + +const struct vm_shape protected_vm_shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, +}; + +static void test_private_access_memslot_deleted(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + pthread_t vm_thread; + void *thread_return; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, + KVM_MEM_GUEST_MEMFD); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + pthread_create(&vm_thread, NULL, + (void *(*)(void *))run_vcpu_get_exit_reason, + (void *)vcpu); + + vm_mem_region_delete(vm, EXITS_TEST_SLOT); + + pthread_join(vm_thread, &thread_return); + exit_reason = (uint32_t)(uint64_t)thread_return; + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +static void test_private_access_memslot_not_private(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + /* Add a non-private memslot (flags = 0) */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, 0); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + exit_reason = run_vcpu_get_exit_reason(vcpu); + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + test_private_access_memslot_deleted(); + test_private_access_memslot_not_private(); +} diff --git a/tools/testing/selftests/kvm/x86/recalc_apic_map_test.c b/tools/testing/selftests/kvm/x86/recalc_apic_map_test.c new file mode 100644 index 000000000000..cbc92a862ea9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/recalc_apic_map_test.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test edge cases and race conditions in kvm_recalculate_apic_map(). 
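+ * A worker thread spams KVM_SET_LAPIC, each call of which forces a map
+ * recalculation, while the main thread repeatedly toggles x2APIC on the
+ * highest-numbered vCPU to widen the problematic window.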
+ */ + +#include +#include +#include + +#include "processor.h" +#include "test_util.h" +#include "kvm_util.h" +#include "apic.h" + +#define TIMEOUT 5 /* seconds */ + +#define LAPIC_DISABLED 0 +#define LAPIC_X2APIC (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) +#define MAX_XAPIC_ID 0xff + +static void *race(void *arg) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu = arg; + + while (1) { + /* Trigger kvm_recalculate_apic_map(). */ + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + pthread_testcancel(); + } + + return NULL; +} + +int main(void) +{ + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct kvm_vcpu *vcpuN; + struct kvm_vm *vm; + pthread_t thread; + time_t t; + int i; + + kvm_static_assert(KVM_MAX_VCPUS > MAX_XAPIC_ID); + + /* + * Create the max number of vCPUs supported by selftests so that KVM + * has decent amount of work to do when recalculating the map, i.e. to + * make the problematic window large enough to hit. + */ + vm = vm_create_with_vcpus(KVM_MAX_VCPUS, NULL, vcpus); + + /* + * Enable x2APIC on all vCPUs so that KVM doesn't bail from the recalc + * due to vCPUs having aliased xAPIC IDs (truncated to 8 bits). + */ + for (i = 0; i < KVM_MAX_VCPUS; i++) + vcpu_set_msr(vcpus[i], MSR_IA32_APICBASE, LAPIC_X2APIC); + + TEST_ASSERT_EQ(pthread_create(&thread, NULL, race, vcpus[0]), 0); + + vcpuN = vcpus[KVM_MAX_VCPUS - 1]; + for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { + vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_X2APIC); + vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_DISABLED); + } + + TEST_ASSERT_EQ(pthread_cancel(thread), 0); + TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86/set_boot_cpu_id.c new file mode 100644 index 000000000000..49913784bc82 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/set_boot_cpu_id.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_SET_BOOT_CPU_ID works as intended + * + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "apic.h" + +static void guest_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT_NE(get_bsp_flag(), 0); + + GUEST_DONE(); +} + +static void guest_not_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT_EQ(get_bsp_flag(), 0); + + GUEST_DONE(); +} + +static void test_set_invalid_bsp(struct kvm_vm *vm) +{ + unsigned long max_vcpu_id = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + int r; + + if (max_vcpu_id) { + r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(max_vcpu_id + 1)); + TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID > MAX should fail"); + } + + r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(1L << 32)); + TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID[63:32]!=0 should fail"); +} + +static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg) +{ + int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID, + (void *)(unsigned long)vcpu->id); + + TEST_ASSERT(r == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set %s", msg); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + int stage; + + for (stage = 0; stage < 2; stage++) { + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + test_set_bsp_busy(vcpu, "while running vm"); + break; + case UCALL_DONE: + TEST_ASSERT(stage == 1, + "Expected GUEST_DONE in stage 2, got stage %d", + stage); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } + } +} + +static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id, + struct kvm_vcpu *vcpus[]) +{ + struct kvm_vm *vm; + uint32_t i; + + vm = vm_create(nr_vcpus); + + test_set_invalid_bsp(vm); + + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id); + + for (i = 0; i < nr_vcpus; i++) + vcpus[i] = vm_vcpu_add(vm, i, i == bsp_vcpu_id ? guest_bsp_vcpu : + guest_not_bsp_vcpu); + return vm; +} + +static void run_vm_bsp(uint32_t bsp_vcpu_id) +{ + struct kvm_vcpu *vcpus[2]; + struct kvm_vm *vm; + + vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus); + + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); + + kvm_vm_free(vm); +} + +static void check_set_bsp_busy(void) +{ + struct kvm_vcpu *vcpus[2]; + struct kvm_vm *vm; + + vm = create_vm(ARRAY_SIZE(vcpus), 0, vcpus); + + test_set_bsp_busy(vcpus[1], "after adding vcpu"); + + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); + + test_set_bsp_busy(vcpus[1], "to a terminated vcpu"); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)); + + run_vm_bsp(0); + run_vm_bsp(1); + run_vm_bsp(0); + + check_set_bsp_busy(); +} diff --git a/tools/testing/selftests/kvm/x86/set_sregs_test.c b/tools/testing/selftests/kvm/x86/set_sregs_test.c new file mode 100644 index 000000000000..c021c0795a96 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/set_sregs_test.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_SET_SREGS tests + * + * Copyright (C) 2018, Google LLC. 
+ * + * This is a regression test for the bug fixed by the following commit: + * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values") + * + * That bug allowed a user-mode program that called the KVM_SET_SREGS + * ioctl to put a VCPU's local APIC into an invalid state. + */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \ +do { \ + struct kvm_sregs new; \ + int rc; \ + \ + /* Skip the sub-test, the feature/bit is supported. */ \ + if (orig.cr & bit) \ + break; \ + \ + memcpy(&new, &orig, sizeof(sregs)); \ + new.cr |= bit; \ + \ + rc = _vcpu_sregs_set(vcpu, &new); \ + TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \ + \ + /* Sanity check that KVM didn't change anything. */ \ + vcpu_sregs_get(vcpu, &new); \ + TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \ +} while (0) + +static uint64_t calc_supported_cr4_feature_bits(void) +{ + uint64_t cr4; + + cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | + X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | + X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; + if (kvm_cpu_has(X86_FEATURE_UMIP)) + cr4 |= X86_CR4_UMIP; + if (kvm_cpu_has(X86_FEATURE_LA57)) + cr4 |= X86_CR4_LA57; + if (kvm_cpu_has(X86_FEATURE_VMX)) + cr4 |= X86_CR4_VMXE; + if (kvm_cpu_has(X86_FEATURE_SMX)) + cr4 |= X86_CR4_SMXE; + if (kvm_cpu_has(X86_FEATURE_FSGSBASE)) + cr4 |= X86_CR4_FSGSBASE; + if (kvm_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (kvm_cpu_has(X86_FEATURE_XSAVE)) + cr4 |= X86_CR4_OSXSAVE; + if (kvm_cpu_has(X86_FEATURE_SMEP)) + cr4 |= X86_CR4_SMEP; + if (kvm_cpu_has(X86_FEATURE_SMAP)) + cr4 |= X86_CR4_SMAP; + if (kvm_cpu_has(X86_FEATURE_PKU)) + cr4 |= X86_CR4_PKE; + + return cr4; +} + +int main(int argc, char *argv[]) +{ + struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t cr4; + int rc, i; + + /* + * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and + * use it to verify all supported CR4 bits can be set prior to defining + * the vCPU model, i.e. without doing KVM_SET_CPUID2. + */ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + + vcpu_sregs_get(vcpu, &sregs); + + sregs.cr0 = 0; + sregs.cr4 |= calc_supported_cr4_feature_bits(); + cr4 = sregs.cr4; + + rc = _vcpu_sregs_set(vcpu, &sregs); + TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); + + vcpu_sregs_get(vcpu, &sregs); + TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", + sregs.cr4, cr4); + + /* Verify all unsupported features are rejected by KVM. */ + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE); + + for (i = 32; i < 64; i++) + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i)); + + /* NW without CD is illegal, as is PG without PE. */ + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW); + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG); + + kvm_vm_free(vm); + + /* Create a "real" VM and verify APIC_BASE can be set. 
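+	 * Bit 10 (x2APIC enable) without bit 11 (xAPIC global enable) is an
+	 * illegal IA32_APIC_BASE combination and must be rejected; bit 11 alone
+	 * is legal.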
*/ + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + vcpu_sregs_get(vcpu, &sregs); + sregs.apic_base = 1 << 10; + rc = _vcpu_sregs_set(vcpu, &sregs); + TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", + sregs.apic_base); + sregs.apic_base = 1 << 11; + rc = _vcpu_sregs_set(vcpu, &sregs); + TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", + sregs.apic_base); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sev_init2_tests.c b/tools/testing/selftests/kvm/x86/sev_init2_tests.c new file mode 100644 index 000000000000..3fb967f40c6a --- /dev/null +++ b/tools/testing/selftests/kvm/x86/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) + */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. 
+ */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & BIT_ULL(i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & BIT_ULL(i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c new file mode 100644 index 000000000000..0a6dfba3905b --- /dev/null +++ b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "sev.h" +#include "kselftest.h" + +#define NR_MIGRATE_TEST_VCPUS 4 +#define NR_MIGRATE_TEST_VMS 3 +#define NR_LOCK_TESTING_THREADS 3 +#define NR_LOCK_TESTING_ITERATIONS 10000 + +bool have_sev_es; + +static struct kvm_vm *sev_vm_create(bool es) +{ + struct kvm_vm *vm; + int i; + + vm = vm_create_barebones(); + if (!es) + sev_vm_init(vm); + else + sev_es_vm_init(vm); + + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(vm, i); + + sev_vm_launch(vm, es ? 
SEV_POLICY_ES : 0); + + if (es) + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + return vm; +} + +static struct kvm_vm *aux_vm_create(bool with_vcpus) +{ + struct kvm_vm *vm; + int i; + + vm = vm_create_barebones(); + if (!with_vcpus) + return vm; + + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(vm, i); + + return vm; +} + +static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) +{ + return __vm_enable_cap(dst, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src->fd); +} + + +static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) +{ + int ret; + + ret = __sev_migrate_from(dst, src); + TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno); +} + +static void test_sev_migrate_from(bool es) +{ + struct kvm_vm *src_vm; + struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS]; + int i, ret; + + src_vm = sev_vm_create(es); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + dst_vms[i] = aux_vm_create(true); + + /* Initial migration from the src to the first dst. */ + sev_migrate_from(dst_vms[0], src_vm); + + for (i = 1; i < NR_MIGRATE_TEST_VMS; i++) + sev_migrate_from(dst_vms[i], dst_vms[i - 1]); + + /* Migrate the guest back to the original VM. */ + ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]); + TEST_ASSERT(ret == -1 && errno == EIO, + "VM that was migrated from should be dead. ret %d, errno: %d", ret, + errno); + + kvm_vm_free(src_vm); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + kvm_vm_free(dst_vms[i]); +} + +struct locking_thread_input { + struct kvm_vm *vm; + struct kvm_vm *source_vms[NR_LOCK_TESTING_THREADS]; +}; + +static void *locking_test_thread(void *arg) +{ + int i, j; + struct locking_thread_input *input = (struct locking_thread_input *)arg; + + for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) { + j = i % NR_LOCK_TESTING_THREADS; + __sev_migrate_from(input->vm, input->source_vms[j]); + } + + return NULL; +} + +static void test_sev_migrate_locking(void) +{ + struct locking_thread_input input[NR_LOCK_TESTING_THREADS]; + pthread_t pt[NR_LOCK_TESTING_THREADS]; + int i; + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) { + input[i].vm = sev_vm_create(/* es= */ false); + input[0].source_vms[i] = input[i].vm; + } + for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i) + memcpy(input[i].source_vms, input[0].source_vms, + sizeof(input[i].source_vms)); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_create(&pt[i], NULL, locking_test_thread, &input[i]); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_join(pt[i], NULL); + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + kvm_vm_free(input[i].vm); +} + +static void test_sev_migrate_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_no_sev, + *sev_es_vm_no_vmsa; + int ret; + + vm_no_vcpu = vm_create_barebones(); + vm_no_sev = aux_vm_create(true); + ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Migrations require SEV enabled. ret %d, errno: %d", ret, + errno); + + if (!have_sev_es) + goto out; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); + sev_es_vm_no_vmsa = vm_create_barebones(); + sev_es_vm_init(sev_es_vm_no_vmsa); + __vm_vcpu_add(sev_es_vm_no_vmsa, 1); + + ret = __sev_migrate_from(sev_vm, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV enabled VM. 
ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(sev_es_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d", + ret, errno); + + kvm_vm_free(sev_vm); + kvm_vm_free(sev_es_vm); + kvm_vm_free(sev_es_vm_no_vmsa); +out: + kvm_vm_free(vm_no_vcpu); + kvm_vm_free(vm_no_sev); +} + +static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) +{ + return __vm_enable_cap(dst, KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, src->fd); +} + + +static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) +{ + int ret; + + ret = __sev_mirror_create(dst, src); + TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno); +} + +static void verify_mirror_allowed_cmds(struct kvm_vm *vm) +{ + struct kvm_sev_guest_status status; + int cmd_id; + + for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) { + int ret; + + /* + * These commands are allowed for mirror VMs, all others are + * not. + */ + switch (cmd_id) { + case KVM_SEV_LAUNCH_UPDATE_VMSA: + case KVM_SEV_GUEST_STATUS: + case KVM_SEV_DBG_DECRYPT: + case KVM_SEV_DBG_ENCRYPT: + continue; + default: + break; + } + + /* + * These commands should be disallowed before the data + * parameter is examined so NULL is OK here. + */ + ret = __vm_sev_ioctl(vm, cmd_id, NULL); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able call command: %d. ret: %d, errno: %d", + cmd_id, ret, errno); + } + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); +} + +static void test_sev_mirror(bool es) +{ + struct kvm_vm *src_vm, *dst_vm; + int i; + + src_vm = sev_vm_create(es); + dst_vm = aux_vm_create(false); + + sev_mirror_create(dst_vm, src_vm); + + /* Check that we can complete creation of the mirror VM. */ + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(dst_vm, i); + + if (es) + vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + + verify_mirror_allowed_cmds(dst_vm); + + kvm_vm_free(src_vm); + kvm_vm_free(dst_vm); +} + +static void test_sev_mirror_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + vm_with_vcpu = aux_vm_create(true); + vm_no_vcpu = aux_vm_create(false); + + ret = __sev_mirror_create(sev_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to self. ret: %d, errno: %d", + ret, errno); + + ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Copy context requires SEV enabled. ret %d, errno: %d", ret, + errno); + + ret = __sev_mirror_create(vm_with_vcpu, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d", + ret, errno); + + if (!have_sev_es) + goto out; + + sev_es_vm = sev_vm_create(/* es= */ true); + ret = __sev_mirror_create(sev_vm, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV enabled VM. 
ret: %d, errno: %d", + ret, errno); + + ret = __sev_mirror_create(sev_es_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d", + ret, errno); + + kvm_vm_free(sev_es_vm); + +out: + kvm_vm_free(sev_vm); + kvm_vm_free(vm_with_vcpu); + kvm_vm_free(vm_no_vcpu); +} + +static void test_sev_move_copy(void) +{ + struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm, + *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm; + + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + dst2_vm = aux_vm_create(true); + dst3_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + dst2_mirror_vm = aux_vm_create(false); + dst3_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm, sev_vm); + + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); + + sev_migrate_from(dst2_vm, dst_vm); + sev_migrate_from(dst2_mirror_vm, dst_mirror_vm); + + sev_migrate_from(dst3_mirror_vm, dst2_mirror_vm); + sev_migrate_from(dst3_vm, dst2_vm); + + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); + kvm_vm_free(dst2_vm); + kvm_vm_free(dst3_vm); + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst2_mirror_vm); + kvm_vm_free(dst3_mirror_vm); + + /* + * Run similar test be destroy mirrors before mirrored VMs to ensure + * destruction is done safely. + */ + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm, sev_vm); + + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); + + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { + test_sev_migrate_from(/* es= */ false); + if (have_sev_es) + test_sev_migrate_from(/* es= */ true); + test_sev_migrate_locking(); + test_sev_migrate_parameters(); + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) + test_sev_move_copy(); + } + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + test_sev_mirror(/* es= */ false); + if (have_sev_es) + test_sev_mirror(/* es= */ true); + test_sev_mirror_parameters(); + } + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c new file mode 100644 index 000000000000..ae77698e6e97 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "linux/psp-sev.h" +#include "sev.h" + + +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + +static void guest_sev_es_code(void) +{ + /* TODO: Check CPUID after GHCB-based hypercall support is added. */ + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED); + + /* + * TODO: Add GHCB and ucall support for SEV-ES guests. 
For now, simply + * force "termination" to signal "done" via the GHCB MSR protocol. + */ + wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ); + __asm__ __volatile__("rep; vmmcall"); +} + +static void guest_sev_code(void) +{ + GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV)); + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); + + GUEST_DONE(); +} + +/* Stash state passed via VMSA before any compiled code runs. */ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $" __stringify(XFEATURE_MASK_X87_AVX) ", %eax\n" + "xor %edx, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + +static void test_sev(void *guest_code, uint64_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. */ + vm_sev_launch(vm, policy, NULL); + + for (;;) { + vcpu_run(vcpu); + + if (policy & SEV_POLICY_ES) { + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + break; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + continue; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } + } + + kvm_vm_free(vm); +} + +static void guest_shutdown_code(void) +{ + struct desc_ptr idt; + + /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. 
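+	 * With a zero-limit IDT the #UD from ud2 cannot be delivered, the
+	 * failure escalates to a triple fault, and the vCPU exits with
+	 * KVM_EXIT_SHUTDOWN.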
*/ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + + __asm__ __volatile__("ud2"); +} + +static void test_sev_es_shutdown(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + uint32_t type = KVM_X86_SEV_ES_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); + + vm_sev_launch(vm, SEV_POLICY_ES, NULL); + + vcpu_run(vcpu); + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, + "Wanted SHUTDOWN, got %s", + exit_reason_str(vcpu->run->exit_reason)); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + const u64 xf_mask = XFEATURE_MASK_X87_AVX; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + test_sev(guest_sev_code, SEV_POLICY_NO_DBG); + test_sev(guest_sev_code, 0); + + if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { + test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); + test_sev(guest_sev_es_code, SEV_POLICY_ES); + + test_sev_es_shutdown(); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c new file mode 100644 index 000000000000..fabeeaddfb3a --- /dev/null +++ b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Test that KVM emulates instructions in response to EPT violations when + * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. + */ +#include "flds_emulation.h" + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define MAXPHYADDR 36 + +#define MEM_REGION_GVA 0x0000123456789000 +#define MEM_REGION_GPA 0x0000000700000000 +#define MEM_REGION_SLOT 10 +#define MEM_REGION_SIZE PAGE_SIZE + +static void guest_code(bool tdp_enabled) +{ + uint64_t error_code; + uint64_t vector; + + vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA)); + + /* + * When TDP is enabled, flds will trigger an emulation failure, exit to + * userspace, and then the selftest host "VMM" skips the instruction. + * + * When TDP is disabled, no instruction emulation is required so flds + * should generate #PF(RSVD). 
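+	 * The reserved bit is planted by the host, which sets BIT(MAXPHYADDR)
+	 * in the guest PTE for MEM_REGION_GVA via vm_get_page_table_entry().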
+ */ + if (tdp_enabled) { + GUEST_ASSERT(!vector); + } else { + GUEST_ASSERT_EQ(vector, PF_VECTOR); + GUEST_ASSERT(error_code & PFERR_RSVD_MASK); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + uint64_t *pte; + uint64_t *hva; + uint64_t gpa; + int rc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); + + rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); + TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / PAGE_SIZE, 0); + gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, + MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc"); + virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, PAGE_SIZE); + + pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); + *pte |= BIT_ULL(MAXPHYADDR); + + vcpu_run(vcpu); + + /* + * When TDP is enabled, KVM must emulate in response the guest physical + * address that is illegal from the guest's perspective, but is legal + * from hardware's perspeective. This should result in an emulation + * failure exit to userspace since KVM doesn't support emulating flds. + */ + if (kvm_is_tdp_enabled()) { + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unrecognized ucall: %lu", uc.cmd); + } + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/smm_test.c b/tools/testing/selftests/kvm/x86/smm_test.c new file mode 100644 index 000000000000..55c88d664a94 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/smm_test.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for SMM. + */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" + +#include "vmx.h" +#include "svm_util.h" + +#define SMRAM_SIZE 65536 +#define SMRAM_MEMSLOT ((1 << 16) | 1) +#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) +#define SMRAM_GPA 0x1000000 +#define SMRAM_STAGE 0xfe + +#define STR(x) #x +#define XSTR(s) STR(s) + +#define SYNC_PORT 0xe +#define DONE 0xff + +/* + * This is compiled as normal 64-bit code, however, SMI handler is executed + * in real-address mode. To stay simple we're limiting ourselves to a mode + * independent subset of asm here. + * SMI handler always report back fixed stage SMRAM_STAGE. 
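+ * The handler is copied to SMRAM_GPA + 0x8000 (the default SMI entry point
+ * relative to SMBASE) by main(); the "in" instruction is what reports the
+ * stage to the host before "rsm" resumes the interrupted context.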
+ */ +uint8_t smi_handler[] = { + 0xb0, SMRAM_STAGE, /* mov $SMRAM_STAGE, %al */ + 0xe4, SYNC_PORT, /* in $SYNC_PORT, %al */ + 0x0f, 0xaa, /* rsm */ +}; + +static inline void sync_with_host(uint64_t phase) +{ + asm volatile("in $" XSTR(SYNC_PORT)", %%al \n" + : "+a" (phase)); +} + +static void self_smi(void) +{ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); +} + +static void l2_guest_code(void) +{ + sync_with_host(8); + + sync_with_host(10); + + vmcall(); +} + +static void guest_code(void *arg) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); + struct svm_test_data *svm = arg; + struct vmx_pages *vmx_pages = arg; + + sync_with_host(1); + + wrmsr(MSR_IA32_APICBASE, apicbase | X2APIC_ENABLE); + + sync_with_host(2); + + self_smi(); + + sync_with_host(4); + + if (arg) { + if (this_cpu_has(X86_FEATURE_SVM)) { + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } else { + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } + + sync_with_host(5); + + self_smi(); + + sync_with_host(7); + + if (this_cpu_has(X86_FEATURE_SVM)) { + run_guest(svm->vmcb, svm->vmcb_gpa); + run_guest(svm->vmcb, svm->vmcb_gpa); + } else { + vmlaunch(); + vmresume(); + } + + /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */ + sync_with_host(12); + } + + sync_with_host(DONE); +} + +void inject_smi(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + events.smi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_SMM; + + vcpu_events_set(vcpu, &events); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + + struct kvm_vcpu *vcpu; + struct kvm_regs regs; + struct kvm_vm *vm; + struct kvm_x86_state *state; + int stage, stage_reported; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA, + SMRAM_MEMSLOT, SMRAM_PAGES, 0); + TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, SMRAM_GPA, SMRAM_MEMSLOT) + == SMRAM_GPA, "could not allocate guest physical addresses?"); + + memset(addr_gpa2hva(vm, SMRAM_GPA), 0x0, SMRAM_SIZE); + memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, + sizeof(smi_handler)); + + vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); + + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_gva); + else if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + } + + if (!nested_gva) + pr_info("will skip SMM test with VMX enabled\n"); + + vcpu_args_set(vcpu, 1, nested_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + memset(®s, 0, sizeof(regs)); + vcpu_regs_get(vcpu, ®s); + + stage_reported = regs.rax & 0xff; + + if (stage_reported == DONE) + goto done; + + TEST_ASSERT(stage_reported == stage || + stage_reported == SMRAM_STAGE, + "Unexpected stage: #%x, got %x", + stage, stage_reported); + + /* + * Enter SMM during L2 execution and check that we correctly + * return from it. Do not perform save/restore while in SMM yet. + */ + if (stage == 8) { + inject_smi(vcpu); + continue; + } + + /* + * Perform save/restore while the guest is in SMM triggered + * during L2 execution. 
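+		 * Unlike stage 8, the SMI injected at stage 10 is immediately
+		 * followed by vcpu_save_state()/vcpu_load_state(), so the
+		 * pending SMI and SMM state must survive the round trip.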
+ */ + if (stage == 10) + inject_smi(vcpu); + + state = vcpu_save_state(vcpu); + kvm_vm_release(vm); + + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c new file mode 100644 index 000000000000..141b7fc0c965 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_GET/SET_* tests + * + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for vCPU state save/restore, including nested guest state. + */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +void svm_l2_guest_code(void) +{ + GUEST_SYNC(4); + /* Exit to L1 */ + vmcall(); + GUEST_SYNC(6); + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +static void svm_l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, svm_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(3); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(5); + vmcb->save.rip += 3; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(7); +} + +void vmx_l2_guest_code(void) +{ + GUEST_SYNC(6); + + /* Exit to L1 */ + vmcall(); + + /* L1 has now set up a shadow VMCS for us. */ + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_SYNC(10); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); + GUEST_SYNC(11); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); + GUEST_SYNC(12); + + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, vmx_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* Check that the launched state is preserved. 
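+	 * i.e. a second VMLAUNCH on an already-launched VMCS must fail, while
+	 * VMRESUME must succeed.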
*/ + GUEST_ASSERT(vmlaunch()); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_SYNC(7); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); + + vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmlaunch()); + GUEST_SYNC(8); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + + vmwrite(GUEST_RIP, 0xc0ffee); + GUEST_SYNC(9); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + + GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + GUEST_SYNC(13); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); +} + +static void __attribute__((__flatten__)) guest_code(void *arg) +{ + GUEST_SYNC(1); + + if (this_cpu_has(X86_FEATURE_XSAVE)) { + uint64_t supported_xcr0 = this_cpu_supported_xcr0(); + uint8_t buffer[4096]; + + memset(buffer, 0xcc, sizeof(buffer)); + + /* + * Modify state for all supported xfeatures to take them out of + * their "init" state, i.e. to make them show up in XSTATE_BV. + * + * Note off-by-default features, e.g. AMX, are out of scope for + * this particular testcase as they have a different ABI. + */ + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_FP); + asm volatile ("fincstp"); + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_SSE); + asm volatile ("vmovdqu %0, %%xmm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_YMM) + asm volatile ("vmovdqu %0, %%ymm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_AVX512) { + asm volatile ("kmovq %0, %%k1" :: "r" (-1ull)); + asm volatile ("vmovupd %0, %%zmm0" :: "m" (buffer)); + asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer)); + } + + if (this_cpu_has(X86_FEATURE_MPX)) { + uint64_t bounds[2] = { 10, 0xffffffffull }; + uint64_t output[2] = { }; + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDREGS); + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDCSR); + + /* + * Don't bother trying to get BNDCSR into the INUSE + * state. MSR_IA32_BNDCFGS doesn't count as it isn't + * managed via XSAVE/XRSTOR, and BNDCFGU can only be + * modified by XRSTOR. Stuffing XSTATE_BV in the host + * is simpler than doing XRSTOR here in the guest. + * + * However, temporarily enable MPX in BNDCFGS so that + * BNDMOV actually loads BND1. If MPX isn't *fully* + * enabled, all MPX instructions are treated as NOPs. + * + * Hand encode "bndmov (%rax),%bnd1" as support for MPX + * mnemonics/registers has been removed from gcc and + * clang (and was never fully supported by clang). + */ + wrmsr(MSR_IA32_BNDCFGS, BIT_ULL(0)); + asm volatile (".byte 0x66,0x0f,0x1a,0x08" :: "a" (bounds)); + /* + * Hand encode "bndmov %bnd1, (%rax)" to sanity check + * that BND1 actually got loaded. 
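+			 * (Opcode 66 0F 1A is the load form of BNDMOV and
+			 * 66 0F 1B is the store form.)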
+ */ + asm volatile (".byte 0x66,0x0f,0x1b,0x08" :: "a" (output)); + wrmsr(MSR_IA32_BNDCFGS, 0); + + GUEST_ASSERT_EQ(bounds[0], output[0]); + GUEST_ASSERT_EQ(bounds[1], output[1]); + } + if (this_cpu_has(X86_FEATURE_PKU)) { + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_PKRU); + set_cr4(get_cr4() | X86_CR4_PKE); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSPKE)); + + wrpkru(-1u); + } + } + + GUEST_SYNC(2); + + if (arg) { + if (this_cpu_has(X86_FEATURE_SVM)) + svm_l1_guest_code(arg); + else + vmx_l1_guest_code(arg); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + uint64_t *xstate_bv, saved_xstate_bv; + vm_vaddr_t nested_gva = 0; + struct kvm_cpuid2 empty_cpuid = {}; + struct kvm_regs regs1, regs2; + struct kvm_vcpu *vcpu, *vcpuN; + struct kvm_vm *vm; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vcpu_regs_get(vcpu, ®s1); + + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_gva); + else if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + } + + if (!nested_gva) + pr_info("will skip nested state checks\n"); + + vcpu_args_set(vcpu, 1, nested_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + + /* + * Restore XSAVE state in a dummy vCPU, first without doing + * KVM_SET_CPUID2, and then with an empty guest CPUID. Except + * for off-by-default xfeatures, e.g. AMX, KVM is supposed to + * allow KVM_SET_XSAVE regardless of guest CPUID. Manually + * load only XSAVE state, MSRs in particular have a much more + * convoluted ABI. + * + * Load two versions of XSAVE state: one with the actual guest + * XSAVE state, and one with all supported features forced "on" + * in xstate_bv, e.g. to ensure that KVM allows loading all + * supported features, even if something goes awry in saving + * the original snapshot. 
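+		 * XSTATE_BV occupies the first 8 bytes of the XSAVE header,
+		 * which starts at byte offset 512 of the XSAVE area; the
+		 * pointer arithmetic below relies on that fixed layout.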
+ */ + xstate_bv = (void *)&((uint8_t *)state->xsave->region)[512]; + saved_xstate_bv = *xstate_bv; + + vcpuN = __vm_vcpu_add(vm, vcpu->id + 1); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = kvm_cpu_supported_xcr0(); + vcpu_xsave_set(vcpuN, state->xsave); + + vcpu_init_cpuid(vcpuN, &empty_cpuid); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = saved_xstate_bv; + vcpu_xsave_set(vcpuN, state->xsave); + + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c new file mode 100644 index 000000000000..916e04248fbb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "apic.h" + +bool vintr_irq_called; +bool intr_irq_called; + +#define VINTR_IRQ_NUMBER 0x20 +#define INTR_IRQ_NUMBER 0x30 + +static void vintr_irq_handler(struct ex_regs *regs) +{ + vintr_irq_called = true; +} + +static void intr_irq_handler(struct ex_regs *regs) +{ + x2apic_write_reg(APIC_EOI, 0x00); + intr_irq_called = true; +} + +static void l2_guest_code(struct svm_test_data *svm) +{ + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, + * and since L1 didn't enable virtual interrupt masking, + * L2 should receive it and not L1. + * + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ + * so it should also receive it after the following 'sti'. + */ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + GUEST_ASSERT(vintr_irq_called); + GUEST_ASSERT(intr_irq_called); + + __asm__ __volatile__( + "vmcall\n" + ); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + x2apic_enable(); + + /* Prepare for L2 execution. 
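+	 * Below, V_INTR_MASKING and the INTR/VINTR intercepts are cleared so
+	 * that both the real interrupt and the virtual V_IRQ are delivered
+	 * directly to L2.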
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* No virtual interrupt masking */ + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* No intercepts for real and virtual interrupts */ + vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR)); + + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); + vmcb->control.int_vector = VINTR_IRQ_NUMBER; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c new file mode 100644 index 000000000000..00135cbba35e --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_nested_shutdown_test + * + * Copyright (C) 2022, Red Hat, Inc. + * + * Nested SVM testing: test that unintercepted shutdown in L2 doesn't crash the host + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +static void l2_guest_code(struct svm_test_data *svm) +{ + __asm__ __volatile__("ud2"); +} + +static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); + + idt[6].p = 0; // #UD is intercepted but its injection will cause #NP + idt[11].p = 0; // #NP is not intercepted and will cause another + // #NP that will be converted to #DF + idt[8].p = 0; // #DF will cause #NP which will cause SHUTDOWN + + run_guest(vmcb, svm->vmcb_gpa); + + /* should not reach here */ + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vcpu_alloc_svm(vm, &svm_gva); + + vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c new file mode 100644 index 000000000000..7b6481d6c0d3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Oracle and/or its affiliates. 
+ * + * Based on: + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ +#include +#include +#include +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "test_util.h" + +#define INT_NR 0x20 + +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static unsigned int bp_fired; +static void guest_bp_handler(struct ex_regs *regs) +{ + bp_fired++; +} + +static unsigned int int_fired; +static void l2_guest_code_int(void); + +static void guest_int_handler(struct ex_regs *regs) +{ + int_fired++; + GUEST_ASSERT_EQ(regs->rip, (unsigned long)l2_guest_code_int); +} + +static void l2_guest_code_int(void) +{ + GUEST_ASSERT_EQ(int_fired, 1); + + /* + * Same as the vmmcall() function, but with a ud2 sneaked after the + * vmmcall. The caller injects an exception with the return address + * increased by 2, so the "pop rbp" must be after the ud2 and we cannot + * use vmmcall() directly. + */ + __asm__ __volatile__("push %%rbp; vmmcall; ud2; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); + + GUEST_ASSERT_EQ(bp_fired, 1); + hlt(); +} + +static atomic_int nmi_stage; +#define nmi_stage_get() atomic_load_explicit(&nmi_stage, memory_order_acquire) +#define nmi_stage_inc() atomic_fetch_add_explicit(&nmi_stage, 1, memory_order_acq_rel) +static void guest_nmi_handler(struct ex_regs *regs) +{ + nmi_stage_inc(); + + if (nmi_stage_get() == 1) { + vmmcall(); + GUEST_FAIL("Unexpected resume after VMMCALL"); + } else { + GUEST_ASSERT_EQ(nmi_stage_get(), 3); + GUEST_DONE(); + } +} + +static void l2_guest_code_nmi(void) +{ + ud2(); +} + +static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t idt_alt) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + if (is_nmi) + x2apic_enable(); + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, + is_nmi ? 
l2_guest_code_nmi : l2_guest_code_int, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR); + vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT); + + if (is_nmi) { + vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + } else { + vmcb->control.event_inj = INT_NR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT; + /* The return address pushed on stack */ + vmcb->control.next_rip = vmcb->save.rip; + } + + run_guest(vmcb, svm->vmcb_gpa); + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, + "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + if (is_nmi) { + clgi(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_NMI); + + GUEST_ASSERT_EQ(nmi_stage_get(), 1); + nmi_stage_inc(); + + stgi(); + /* self-NMI happens here */ + while (true) + cpu_relax(); + } + + /* Skip over VMMCALL */ + vmcb->save.rip += 3; + + /* Switch to alternate IDT to cause intervening NPF again */ + vmcb->save.idtr.base = idt_alt; + vmcb->control.clean = 0; /* &= ~BIT(VMCB_DT) would be enough */ + + vmcb->control.event_inj = BP_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; + /* The return address pushed on stack, skip over UD2 */ + vmcb->control.next_rip = vmcb->save.rip + 2; + + run_guest(vmcb, svm->vmcb_gpa); + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, + "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +static void run_test(bool is_nmi) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t svm_gva; + vm_vaddr_t idt_alt_vm; + struct kvm_guest_debug debug; + + pr_info("Running %s test\n", is_nmi ? 
"NMI" : "soft int"); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); + vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); + vm_install_exception_handler(vm, INT_NR, guest_int_handler); + + vcpu_alloc_svm(vm, &svm_gva); + + if (!is_nmi) { + void *idt, *idt_alt; + + idt_alt_vm = vm_vaddr_alloc_page(vm); + idt_alt = addr_gva2hva(vm, idt_alt_vm); + idt = addr_gva2hva(vm, vm->arch.idt); + memcpy(idt_alt, idt, getpagesize()); + } else { + idt_alt_vm = 0; + } + vcpu_args_set(vcpu, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); + + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS), + "KVM with nSVM is supposed to unconditionally advertise nRIP Save"); + + atomic_init(&nmi_stage, 0); + + run_test(false); + run_test(true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c new file mode 100644 index 000000000000..8a62cca28cfb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_vmcall_test + * + * Copyright (C) 2020, Red Hat, Inc. + * + * Nested SVM testing: VMCALL + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +static void l2_guest_code(struct svm_test_data *svm) +{ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sync_regs_test.c b/tools/testing/selftests/kvm/x86/sync_regs_test.c new file mode 100644 index 000000000000..8fa3948b0170 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/sync_regs_test.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for x86 KVM_CAP_SYNC_REGS + * + * Copyright (C) 2018, Google LLC. 
+ * + * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality, + * including requesting an invalid register set, updates to/from values + * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. + */ +#include +#include +#include +#include +#include +#include + +#include "kvm_test_harness.h" +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define UCALL_PIO_PORT ((uint16_t)0x1000) + +struct ucall uc_none = { + .cmd = UCALL_NONE, +}; + +/* + * ucall is embedded here to protect against compiler reshuffling registers + * before calling a function. In this test we only need to get KVM_EXIT_IO + * vmexit and preserve RBX, no additional information is needed. + */ +void guest_code(void) +{ + asm volatile("1: in %[port], %%al\n" + "add $0x1, %%rbx\n" + "jmp 1b" + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none) + : "rax", "rbx"); +} + +KVM_ONE_VCPU_TEST_SUITE(sync_regs_test); + +static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) +{ +#define REG_COMPARE(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%llx, 0x%llx", \ + left->reg, right->reg) + REG_COMPARE(rax); + REG_COMPARE(rbx); + REG_COMPARE(rcx); + REG_COMPARE(rdx); + REG_COMPARE(rsi); + REG_COMPARE(rdi); + REG_COMPARE(rsp); + REG_COMPARE(rbp); + REG_COMPARE(r8); + REG_COMPARE(r9); + REG_COMPARE(r10); + REG_COMPARE(r11); + REG_COMPARE(r12); + REG_COMPARE(r13); + REG_COMPARE(r14); + REG_COMPARE(r15); + REG_COMPARE(rip); + REG_COMPARE(rflags); +#undef REG_COMPARE +} + +static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right) +{ +} + +static void compare_vcpu_events(struct kvm_vcpu_events *left, + struct kvm_vcpu_events *right) +{ +} + +#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS) +#define INVALID_SYNC_FIELD 0x80000000 + +/* + * Set an exception as pending *and* injected while KVM is processing events. + * KVM is supposed to ignore/drop pending exceptions if userspace is also + * requesting that an exception be injected. + */ +static void *race_events_inj_pen(void *arg) +{ + struct kvm_run *run = (struct kvm_run *)arg; + struct kvm_vcpu_events *events = &run->s.regs.events; + + WRITE_ONCE(events->exception.nr, UD_VECTOR); + + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); + WRITE_ONCE(events->flags, 0); + WRITE_ONCE(events->exception.injected, 1); + WRITE_ONCE(events->exception.pending, 1); + + pthread_testcancel(); + } + + return NULL; +} + +/* + * Set an invalid exception vector while KVM is processing events. KVM is + * supposed to reject any vector >= 32, as well as NMIs (vector 2). + */ +static void *race_events_exc(void *arg) +{ + struct kvm_run *run = (struct kvm_run *)arg; + struct kvm_vcpu_events *events = &run->s.regs.events; + + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); + WRITE_ONCE(events->flags, 0); + WRITE_ONCE(events->exception.nr, UD_VECTOR); + WRITE_ONCE(events->exception.pending, 1); + WRITE_ONCE(events->exception.nr, 255); + + pthread_testcancel(); + } + + return NULL; +} + +/* + * Toggle CR4.PAE while KVM is processing SREGS, EFER.LME=1 with CR4.PAE=0 is + * illegal, and KVM's MMU heavily relies on vCPU state being valid. 
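+ * The racer thread below repeatedly clears CR4.PAE in the shared
+ * run->s.regs.sregs while marking SREGS dirty, so KVM may observe the
+ * illegal combination mid-update.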
+ */ +static noinline void *race_sregs_cr4(void *arg) +{ + struct kvm_run *run = (struct kvm_run *)arg; + __u64 *cr4 = &run->s.regs.sregs.cr4; + __u64 pae_enabled = *cr4; + __u64 pae_disabled = *cr4 & ~X86_CR4_PAE; + + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_SREGS); + WRITE_ONCE(*cr4, pae_enabled); + asm volatile(".rept 512\n\t" + "nop\n\t" + ".endr"); + WRITE_ONCE(*cr4, pae_disabled); + + pthread_testcancel(); + } + + return NULL; +} + +static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer) +{ + const time_t TIMEOUT = 2; /* seconds, roughly */ + struct kvm_x86_state *state; + struct kvm_translation tr; + struct kvm_run *run; + pthread_t thread; + time_t t; + + run = vcpu->run; + + run->kvm_valid_regs = KVM_SYNC_X86_SREGS; + vcpu_run(vcpu); + run->kvm_valid_regs = 0; + + /* Save state *before* spawning the thread that mucks with vCPU state. */ + state = vcpu_save_state(vcpu); + + /* + * Selftests run 64-bit guests by default, both EFER.LME and CR4.PAE + * should already be set in guest state. + */ + TEST_ASSERT((run->s.regs.sregs.cr4 & X86_CR4_PAE) && + (run->s.regs.sregs.efer & EFER_LME), + "vCPU should be in long mode, CR4.PAE=%d, EFER.LME=%d", + !!(run->s.regs.sregs.cr4 & X86_CR4_PAE), + !!(run->s.regs.sregs.efer & EFER_LME)); + + TEST_ASSERT_EQ(pthread_create(&thread, NULL, racer, (void *)run), 0); + + for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { + /* + * Reload known good state if the vCPU triple faults, e.g. due + * to the unhandled #GPs being injected. VMX preserves state + * on shutdown, but SVM synthesizes an INIT as the VMCB state + * is architecturally undefined on triple fault. + */ + if (!__vcpu_run(vcpu) && run->exit_reason == KVM_EXIT_SHUTDOWN) + vcpu_load_state(vcpu, state); + + if (racer == race_sregs_cr4) { + tr = (struct kvm_translation) { .linear_address = 0 }; + __vcpu_ioctl(vcpu, KVM_TRANSLATE, &tr); + } + } + + TEST_ASSERT_EQ(pthread_cancel(thread), 0); + TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); + + kvm_x86_state_cleanup(state); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code) +{ + struct kvm_run *run = vcpu->run; + int rv; + + /* Request reading invalid register set from VCPU. */ + run->kvm_valid_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_valid_regs = 0; + + run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_valid_regs = 0; +} + +KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code) +{ + struct kvm_run *run = vcpu->run; + int rv; + + /* Request setting invalid register set into VCPU. */ + run->kvm_dirty_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_dirty_regs = 0; + + run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", + rv); + run->kvm_dirty_regs = 0; +} + +KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_events events; + struct kvm_sregs sregs; + struct kvm_regs regs; + + /* Request and verify all valid register sets. 
*/ + /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + vcpu_regs_get(vcpu, ®s); + compare_regs(®s, &run->s.regs.regs); + + vcpu_sregs_get(vcpu, &sregs); + compare_sregs(&sregs, &run->s.regs.sregs); + + vcpu_events_get(vcpu, &events); + compare_vcpu_events(&events, &run->s.regs.events); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_events events; + struct kvm_sregs sregs; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + /* Set and verify various register values. */ + run->s.regs.regs.rbx = 0xBAD1DEA; + run->s.regs.sregs.apic_base = 1 << 11; + /* TODO run->s.regs.events.XYZ = ABC; */ + + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11, + "apic_base sync regs value incorrect 0x%llx.", + run->s.regs.sregs.apic_base); + + vcpu_regs_get(vcpu, ®s); + compare_regs(®s, &run->s.regs.regs); + + vcpu_sregs_get(vcpu, &sregs); + compare_sregs(&sregs, &run->s.regs.sregs); + + vcpu_events_get(vcpu, &events); + compare_vcpu_events(&events, &run->s.regs.events); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code) +{ + struct kvm_run *run = vcpu->run; + + /* Clear kvm_dirty_regs bits, verify new s.regs values are + * overwritten with existing guest values. + */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = 0; + run->s.regs.regs.rbx = 0xDEADBEEF; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + /* Clear kvm_valid_regs bits and kvm_dirty_bits. + * Verify s.regs values are not overwritten with existing guest values + * and that guest values are not overwritten with kvm_sync_regs values. + */ + run->kvm_valid_regs = 0; + run->kvm_dirty_regs = 0; + run->s.regs.regs.rbx = 0xAAAA; + vcpu_regs_get(vcpu, ®s); + regs.rbx = 0xBAC0; + vcpu_regs_set(vcpu, ®s); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + vcpu_regs_get(vcpu, ®s); + TEST_ASSERT(regs.rbx == 0xBAC0 + 1, + "rbx guest value incorrect 0x%llx.", + regs.rbx); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten + * with existing guest values but that guest values are overwritten + * with kvm_sync_regs values. 
+ */ + run->kvm_valid_regs = 0; + run->kvm_dirty_regs = TEST_SYNC_FIELDS; + run->s.regs.regs.rbx = 0xBBBB; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + vcpu_regs_get(vcpu, ®s); + TEST_ASSERT(regs.rbx == 0xBBBB + 1, + "rbx guest value incorrect 0x%llx.", + regs.rbx); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code) +{ + race_sync_regs(vcpu, race_sregs_cr4); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code) +{ + race_sync_regs(vcpu, race_events_exc); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code) +{ + race_sync_regs(vcpu, race_events_inj_pen); +} + +int main(int argc, char *argv[]) +{ + int cap; + + cap = kvm_check_cap(KVM_CAP_SYNC_REGS); + TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS); + TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD)); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c new file mode 100644 index 000000000000..56306a19144a --- /dev/null +++ b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#include +#include + +#include "kselftest.h" + +#define ARBITRARY_IO_PORT 0x2000 + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + asm volatile("inb %%dx, %%al" + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); +} + +#define L2_GUEST_STACK_SIZE 64 +unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + +void l1_guest_code_vmx(struct vmx_pages *vmx) +{ + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + /* L2 should triple fault after a triple fault event injected. 
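+	 * The injected triple fault is reflected to L1 as a TRIPLE_FAULT
+	 * VM-exit rather than shutting the VM down.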
*/ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); + GUEST_DONE(); +} + +void l1_guest_code_svm(struct svm_test_data *svm) +{ + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* don't intercept shutdown to test the case of SVM allowing to do so */ + vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); + + run_guest(vmcb, svm->vmcb_gpa); + + /* should not reach here, L1 should crash */ + GUEST_ASSERT(0); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vcpu_events events; + struct ucall uc; + + bool has_vmx = kvm_cpu_has(X86_FEATURE_VMX); + bool has_svm = kvm_cpu_has(X86_FEATURE_SVM); + + TEST_REQUIRE(has_vmx || has_svm); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)); + + + if (has_vmx) { + vm_vaddr_t vmx_pages_gva; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_vmx); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + } else { + vm_vaddr_t svm_gva; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_svm); + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + } + + vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); + run = vcpu->run; + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, + "Expected IN from port %d from L2, got port %d", + ARBITRARY_IO_PORT, run->io.port); + vcpu_events_get(vcpu, &events); + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = true; + vcpu_events_set(vcpu, &events); + run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + vcpu_events_get(vcpu, &events); + TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, + "Triple fault event invalid"); + TEST_ASSERT(events.triple_fault.pending, + "No triple fault pending"); + vcpu_run(vcpu); + + + if (has_svm) { + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + } else { + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86/tsc_msrs_test.c new file mode 100644 index 000000000000..12b0964f4f13 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/tsc_msrs_test.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST. + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include "kvm_util.h" +#include "processor.h" + +#define UNITY (1ull << 30) +#define HOST_ADJUST (UNITY * 64) +#define GUEST_STEP (UNITY * 4) +#define ROUND(x) ((x + UNITY / 2) & -UNITY) +#define rounded_rdmsr(x) ROUND(rdmsr(x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vcpu, x)) + +static void guest_code(void) +{ + u64 val = 0; + + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + val = 1ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. 
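+	 * A WRMSR to IA32_TSC_ADJUST also adds the delta to the TSC, so with
+	 * TSC == TSC_ADJUST beforehand both MSRs read back the new value.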
*/ + GUEST_SYNC(2); + val = 2ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Host: setting the TSC offset. */ + GUEST_SYNC(3); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + GUEST_SYNC(4); + val = 3ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + GUEST_SYNC(5); + val = 4ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1) + ksft_test_result_pass("stage %d passed\n", stage + 1); + else + ksft_test_result_fail( + "stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + ksft_test_result_pass("stage %d passed\n", stage + 1); + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t val; + + ksft_print_header(); + ksft_set_plan(5); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + val = 0; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + run_vcpu(vcpu, 1); + val = 1ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + run_vcpu(vcpu, 2); + val = 2ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Host: writes to MSR_IA32_TSC set the host-side offset + * and therefore do not change MSR_IA32_TSC_ADJUST. + */ + vcpu_set_msr(vcpu, MSR_IA32_TSC, HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + run_vcpu(vcpu, 3); + + /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, UNITY * 123456); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_TSC_ADJUST), UNITY * 123456); + + /* Restore previous value. */ + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. 
+ */ + run_vcpu(vcpu, 4); + val = 3ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + run_vcpu(vcpu, 5); + val = 4ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/x86/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86/tsc_scaling_sync.c new file mode 100644 index 000000000000..59c7304f805e --- /dev/null +++ b/tools/testing/selftests/kvm/x86/tsc_scaling_sync.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include +#include +#include +#include +#include + +#define NR_TEST_VCPUS 20 + +static struct kvm_vm *vm; +pthread_spinlock_t create_lock; + +#define TEST_TSC_KHZ 2345678UL +#define TEST_TSC_OFFSET 200000000 + +uint64_t tsc_sync; +static void guest_code(void) +{ + uint64_t start_tsc, local_tsc, tmp; + + start_tsc = rdtsc(); + do { + tmp = READ_ONCE(tsc_sync); + local_tsc = rdtsc(); + WRITE_ONCE(tsc_sync, local_tsc); + if (unlikely(local_tsc < tmp)) + GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0); + + } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ); + + GUEST_DONE(); +} + + +static void *run_vcpu(void *_cpu_nr) +{ + unsigned long vcpu_id = (unsigned long)_cpu_nr; + unsigned long failures = 0; + static bool first_cpu_done; + struct kvm_vcpu *vcpu; + + /* The kernel is fine, but vm_vcpu_add() needs locking */ + pthread_spin_lock(&create_lock); + + vcpu = vm_vcpu_add(vm, vcpu_id, guest_code); + + if (!first_cpu_done) { + first_cpu_done = true; + vcpu_set_msr(vcpu, MSR_IA32_TSC, TEST_TSC_OFFSET); + } + + pthread_spin_unlock(&create_lock); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + goto out; + + case UCALL_SYNC: + printf("Guest %d sync %lx %lx %ld\n", vcpu->id, + uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); + failures++; + break; + + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + out: + return (void *)failures; +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)); + + vm = vm_create(NR_TEST_VCPUS); + vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); + + pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); + pthread_t cpu_threads[NR_TEST_VCPUS]; + unsigned long cpu; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) + pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu); + + unsigned long failures = 0; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) { + void *this_cpu_failures; + pthread_join(cpu_threads[cpu], &this_cpu_failures); + failures += (unsigned long)this_cpu_failures; + } + + TEST_ASSERT(!failures, "TSC sync failed"); + pthread_spin_destroy(&create_lock); + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/ucna_injection_test.c b/tools/testing/selftests/kvm/x86/ucna_injection_test.c new file mode 100644 index 000000000000..57f157c06b39 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/ucna_injection_test.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* + * ucna_injection_test + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that user space can inject UnCorrectable No Action required (UCNA) + * memory errors to the guest. + * + * The test starts one vCPU with the MCG_CMCI_P enabled. It verifies that + * proper UCNA errors can be injected to a vCPU with MCG_CMCI_P and + * corresponding per-bank control register (MCI_CTL2) bit enabled. + * The test also checks that the UCNA errors get recorded in the + * Machine Check bank registers no matter the error signal interrupts get + * delivered into the guest or not. + * + */ +#include +#include +#include +#include + +#include "kvm_util.h" +#include "mce.h" +#include "processor.h" +#include "test_util.h" +#include "apic.h" + +#define SYNC_FIRST_UCNA 9 +#define SYNC_SECOND_UCNA 10 +#define SYNC_GP 11 +#define FIRST_UCNA_ADDR 0xdeadbeef +#define SECOND_UCNA_ADDR 0xcafeb0ba + +/* + * Vector for the CMCI interrupt. + * Value is arbitrary. Any value in 0x20-0xFF should work: + * https://wiki.osdev.org/Interrupt_Vector_Table + */ +#define CMCI_VECTOR 0xa9 + +#define UCNA_BANK 0x7 // IMC0 bank + +#define MCI_CTL2_RESERVED_BIT BIT_ULL(29) + +static uint64_t supported_mcg_caps; + +/* + * Record states about the injected UCNA. + * The variables started with the 'i_' prefixes are recorded in interrupt + * handler. Variables without the 'i_' prefixes are recorded in guest main + * execution thread. + */ +static volatile uint64_t i_ucna_rcvd; +static volatile uint64_t i_ucna_addr; +static volatile uint64_t ucna_addr; +static volatile uint64_t ucna_addr2; + +struct thread_params { + struct kvm_vcpu *vcpu; + uint64_t *p_i_ucna_rcvd; + uint64_t *p_i_ucna_addr; + uint64_t *p_ucna_addr; + uint64_t *p_ucna_addr2; +}; + +static void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void ucna_injection_guest_code(void) +{ + uint64_t ctl2; + verify_apic_base_addr(); + xapic_enable(); + + /* Sets up the interrupt vector and enables per-bank CMCI sigaling. */ + xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED); + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + /* Enables interrupt in guest. */ + asm volatile("sti"); + + /* Let user space inject the first UCNA */ + GUEST_SYNC(SYNC_FIRST_UCNA); + + ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + + /* Disables the per-bank CMCI signaling. 
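+	 * With CMCI_EN cleared the second UCNA must still be recorded in the
+	 * bank registers, but must not raise a CMCI interrupt.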
*/ + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN); + + /* Let the user space inject the second UCNA */ + GUEST_SYNC(SYNC_SECOND_UCNA); + + ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + GUEST_DONE(); +} + +static void cmci_disabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + GUEST_DONE(); +} + +static void cmci_enabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_RESERVED_BIT); + + GUEST_DONE(); +} + +static void guest_cmci_handler(struct ex_regs *regs) +{ + i_ucna_rcvd++; + i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + xapic_write_reg(APIC_EOI, 0); +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(SYNC_GP); +} + +static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected."); + printf("vCPU received GP in guest.\n"); +} + +static void inject_ucna(struct kvm_vcpu *vcpu, uint64_t addr) { + /* + * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in + * the IA32_MCi_STATUS register. + * MSCOD=1 (BIT[16] - MscodDataRdErr). + * MCACOD=0x0090 (Memory controller error format, channel 0) + */ + uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090; + struct kvm_x86_mce mce = {}; + mce.status = status; + mce.mcg_status = 0; + /* + * MCM_ADDR_PHYS indicates the reported address is a physical address. + * Lowest 6 bits is the recoverable address LSB, i.e., the injected MCE + * is at 4KB granularity. 
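+	 * (0xc == 12, i.e. bits 11:0 of the reported address are invalid,
+	 * matching the 4KB granularity noted above.)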
+ */ + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; + mce.addr = addr; + mce.bank = UCNA_BANK; + + vcpu_ioctl(vcpu, KVM_X86_SET_MCE, &mce); +} + +static void *run_ucna_injection(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed with errno=%d", + r); + + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); + + printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); + + inject_ucna(params->vcpu, FIRST_UCNA_ADDR); + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); + + printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); + + inject_ucna(params->vcpu, SECOND_UCNA_ADDR); + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, "vCPU assertion failure: %s.", + (const char *)uc.args[0]); + } + + return NULL; +} + +static void test_ucna_injection(struct kvm_vcpu *vcpu, struct thread_params *params) +{ + struct kvm_vm *vm = vcpu->vm; + params->vcpu = vcpu; + params->p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd); + params->p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr); + params->p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr); + params->p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2); + + run_ucna_injection(params); + + TEST_ASSERT(*params->p_i_ucna_rcvd == 1, "Only first UCNA get signaled."); + TEST_ASSERT(*params->p_i_ucna_addr == FIRST_UCNA_ADDR, + "Only first UCNA reported addr get recorded via interrupt."); + TEST_ASSERT(*params->p_ucna_addr == FIRST_UCNA_ADDR, + "First injected UCNAs should get exposed via registers."); + TEST_ASSERT(*params->p_ucna_addr2 == SECOND_UCNA_ADDR, + "Second injected UCNAs should get exposed via registers."); + + printf("Test successful.\n" + "UCNA CMCI interrupts received: %ld\n" + "Last UCNA address received via CMCI: %lx\n" + "First UCNA address in vCPU thread: %lx\n" + "Second UCNA address in vCPU thread: %lx\n", + *params->p_i_ucna_rcvd, *params->p_i_ucna_addr, + *params->p_ucna_addr, *params->p_ucna_addr2); +} + +static void setup_mce_cap(struct kvm_vcpu *vcpu, bool enable_cmci_p) +{ + uint64_t mcg_caps = MCG_CTL_P | MCG_SER_P | MCG_LMCE_P | KVM_MAX_MCE_BANKS; + if (enable_cmci_p) + mcg_caps |= MCG_CMCI_P; + + mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK; + vcpu_ioctl(vcpu, KVM_X86_SETUP_MCE, &mcg_caps); +} + +static struct kvm_vcpu *create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid, + bool enable_cmci_p, void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpuid, guest_code); + setup_mce_cap(vcpu, enable_cmci_p); + return vcpu; +} + +int main(int argc, char *argv[]) +{ + struct thread_params params; + struct kvm_vm *vm; + struct kvm_vcpu *ucna_vcpu; + struct kvm_vcpu *cmcidis_vcpu; + struct kvm_vcpu *cmci_vcpu; + + kvm_check_cap(KVM_CAP_MCE); + + vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0); + + kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, + &supported_mcg_caps); + + if (!(supported_mcg_caps & 
MCG_CMCI_P)) { + print_skip("MCG_CMCI_P is not supported"); + exit(KSFT_SKIP); + } + + ucna_vcpu = create_vcpu_with_mce_cap(vm, 0, true, ucna_injection_guest_code); + cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); + cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); + + vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + test_ucna_injection(ucna_vcpu, ¶ms); + run_vcpu_expect_gp(cmcidis_vcpu); + run_vcpu_expect_gp(cmci_vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/userspace_io_test.c b/tools/testing/selftests/kvm/x86/userspace_io_test.c new file mode 100644 index 000000000000..9481cbcf284f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/userspace_io_test.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +static void guest_ins_port80(uint8_t *buffer, unsigned int count) +{ + unsigned long end; + + if (count == 2) + end = (unsigned long)buffer + 1; + else + end = (unsigned long)buffer + 8192; + + asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory"); + GUEST_ASSERT_EQ(count, 0); + GUEST_ASSERT_EQ((unsigned long)buffer, end); +} + +static void guest_code(void) +{ + uint8_t buffer[8192]; + int i; + + /* + * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to + * test that KVM doesn't explode when userspace modifies the "count" on + * a userspace I/O exit. KVM isn't required to play nice with the I/O + * itself as KVM doesn't support manipulating the count, it just needs + * to not explode or overflow a buffer. + */ + guest_ins_port80(buffer, 2); + guest_ins_port80(buffer, 3); + + /* Verify KVM fills the buffer correctly when not stuffing RCX. */ + memset(buffer, 0, sizeof(buffer)); + guest_ins_port80(buffer, 8192); + for (i = 0; i < 8192; i++) + __GUEST_ASSERT(buffer[i] == 0xaa, + "Expected '0xaa', got '0x%x' at buffer[%u]", + buffer[i], i); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + memset(®s, 0, sizeof(regs)); + + while (1) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (get_ucall(vcpu, &uc)) + break; + + TEST_ASSERT(run->io.port == 0x80, + "Expected I/O at port 0x80, got port 0x%x", run->io.port); + + /* + * Modify the rep string count in RCX: 2 => 1 and 3 => 8192. + * Note, this abuses KVM's batching of rep string I/O to avoid + * getting stuck in an infinite loop. That behavior isn't in + * scope from a testing perspective as it's not ABI in any way, + * i.e. it really is abusing internal KVM knowledge. 
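+		 * The 0xaa pattern written at run->io.data_offset below is
+		 * what the guest's final, unmodified 8192-byte pass checks for.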
+ */ + vcpu_regs_get(vcpu, ®s); + if (regs.rcx == 2) + regs.rcx = 1; + if (regs.rcx == 3) + regs.rcx = 8192; + memset((void *)run + run->io.data_offset, 0xaa, 4096); + vcpu_regs_set(vcpu, ®s); + } + + switch (uc.cmd) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86/userspace_msr_exit_test.c new file mode 100644 index 000000000000..32b2794b78fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86/userspace_msr_exit_test.c @@ -0,0 +1,769 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for exiting into userspace on registered MSRs + */ +#include + +#include "kvm_test_harness.h" +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define MSR_NON_EXISTENT 0x474f4f00 + +static u64 deny_bits = 0; +struct kvm_msr_filter filter_allow = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test an MSR the kernel knows about. */ + .base = MSR_IA32_XSS, + .bitmap = (uint8_t*)&deny_bits, + }, { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test an MSR the kernel doesn't know about. */ + .base = MSR_IA32_FLUSH_CMD, + .bitmap = (uint8_t*)&deny_bits, + }, { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test a fabricated MSR that no one knows about. */ + .base = MSR_NON_EXISTENT, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +struct kvm_msr_filter filter_fs = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = MSR_FS_BASE, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +struct kvm_msr_filter filter_gs = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = MSR_GS_BASE, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +static uint64_t msr_non_existent_data; +static int guest_exception_count; +static u32 msr_reads, msr_writes; + +static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_deadbeef[1] = { 0x1 }; + +static void deny_msr(uint8_t *bitmap, u32 msr) +{ + u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1); + + bitmap[idx / 8] &= ~(1 << (idx % 8)); +} + +static void prepare_bitmaps(void) +{ + memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000)); + memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write)); + memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000)); + memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000)); + memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read)); + + deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL); + deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK); + deny_msr(bitmap_c0000000_read, MSR_GS_BASE); +} + +struct kvm_msr_filter filter_deny = { + .flags = KVM_MSR_FILTER_DEFAULT_DENY, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0x00000000, + .nmsrs = 
KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000_write, + }, { + .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE, + .base = 0x40000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_40000000, + }, { + .flags = KVM_MSR_FILTER_READ, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000_read, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000, + }, { + .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ, + .base = 0xdeadbeef, + .nmsrs = 1, + .bitmap = bitmap_deadbeef, + }, + }, +}; + +struct kvm_msr_filter no_filter_deny = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, +}; + +/* + * Note: Force test_rdmsr() to not be inlined to prevent the labels, + * rdmsr_start and rdmsr_end, from being defined multiple times. + */ +static noinline uint64_t test_rdmsr(uint32_t msr) +{ + uint32_t a, d; + + guest_exception_count = 0; + + __asm__ __volatile__("rdmsr_start: rdmsr; rdmsr_end:" : + "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +/* + * Note: Force test_wrmsr() to not be inlined to prevent the labels, + * wrmsr_start and wrmsr_end, from being defined multiple times. + */ +static noinline void test_wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + guest_exception_count = 0; + + __asm__ __volatile__("wrmsr_start: wrmsr; wrmsr_end:" :: + "a"(a), "d"(d), "c"(msr) : "memory"); +} + +extern char rdmsr_start, rdmsr_end; +extern char wrmsr_start, wrmsr_end; + +/* + * Note: Force test_em_rdmsr() to not be inlined to prevent the labels, + * rdmsr_start and rdmsr_end, from being defined multiple times. + */ +static noinline uint64_t test_em_rdmsr(uint32_t msr) +{ + uint32_t a, d; + + guest_exception_count = 0; + + __asm__ __volatile__(KVM_FEP "em_rdmsr_start: rdmsr; em_rdmsr_end:" : + "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +/* + * Note: Force test_em_wrmsr() to not be inlined to prevent the labels, + * wrmsr_start and wrmsr_end, from being defined multiple times. + */ +static noinline void test_em_wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + guest_exception_count = 0; + + __asm__ __volatile__(KVM_FEP "em_wrmsr_start: wrmsr; em_wrmsr_end:" :: + "a"(a), "d"(d), "c"(msr) : "memory"); +} + +extern char em_rdmsr_start, em_rdmsr_end; +extern char em_wrmsr_start, em_wrmsr_end; + +static void guest_code_filter_allow(void) +{ + uint64_t data; + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_XSS. + * + * A GP is thrown if anything other than 0 is written to + * MSR_IA32_XSS. + */ + data = test_rdmsr(MSR_IA32_XSS); + GUEST_ASSERT(data == 0); + GUEST_ASSERT(guest_exception_count == 0); + + test_wrmsr(MSR_IA32_XSS, 0); + GUEST_ASSERT(guest_exception_count == 0); + + test_wrmsr(MSR_IA32_XSS, 1); + GUEST_ASSERT(guest_exception_count == 1); + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_FLUSH_CMD. + * + * A GP is thrown if MSR_IA32_FLUSH_CMD is read + * from or if a value other than 1 is written to it. 
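+	 * The #GPs come from the userspace exit handler setting
+	 * run->msr.error, not from KVM's own MSR emulation.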
+ */ + test_rdmsr(MSR_IA32_FLUSH_CMD); + GUEST_ASSERT(guest_exception_count == 1); + + test_wrmsr(MSR_IA32_FLUSH_CMD, 0); + GUEST_ASSERT(guest_exception_count == 1); + + test_wrmsr(MSR_IA32_FLUSH_CMD, 1); + GUEST_ASSERT(guest_exception_count == 0); + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_NON_EXISTENT. + * + * Test that a fabricated MSR can pass through the kernel + * and be handled in userspace. + */ + test_wrmsr(MSR_NON_EXISTENT, 2); + GUEST_ASSERT(guest_exception_count == 0); + + data = test_rdmsr(MSR_NON_EXISTENT); + GUEST_ASSERT(data == 2); + GUEST_ASSERT(guest_exception_count == 0); + + if (is_forced_emulation_enabled) { + /* Let userspace know we aren't done. */ + GUEST_SYNC(0); + + /* + * Now run the same tests with the instruction emulator. + */ + data = test_em_rdmsr(MSR_IA32_XSS); + GUEST_ASSERT(data == 0); + GUEST_ASSERT(guest_exception_count == 0); + test_em_wrmsr(MSR_IA32_XSS, 0); + GUEST_ASSERT(guest_exception_count == 0); + test_em_wrmsr(MSR_IA32_XSS, 1); + GUEST_ASSERT(guest_exception_count == 1); + + test_em_rdmsr(MSR_IA32_FLUSH_CMD); + GUEST_ASSERT(guest_exception_count == 1); + test_em_wrmsr(MSR_IA32_FLUSH_CMD, 0); + GUEST_ASSERT(guest_exception_count == 1); + test_em_wrmsr(MSR_IA32_FLUSH_CMD, 1); + GUEST_ASSERT(guest_exception_count == 0); + + test_em_wrmsr(MSR_NON_EXISTENT, 2); + GUEST_ASSERT(guest_exception_count == 0); + data = test_em_rdmsr(MSR_NON_EXISTENT); + GUEST_ASSERT(data == 2); + GUEST_ASSERT(guest_exception_count == 0); + } + + GUEST_DONE(); +} + +static void guest_msr_calls(bool trapped) +{ + /* This goes into the in-kernel emulation */ + wrmsr(MSR_SYSCALL_MASK, 0); + + if (trapped) { + /* This goes into user space emulation */ + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE); + } else { + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE); + } + + /* If trapped == true, this goes into user space emulation */ + wrmsr(MSR_IA32_POWER_CTL, 0x1234); + + /* This goes into the in-kernel emulation */ + rdmsr(MSR_IA32_POWER_CTL); + + /* Invalid MSR, should always be handled by user space exit */ + GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef); + wrmsr(0xdeadbeef, 0x1234); +} + +static void guest_code_filter_deny(void) +{ + guest_msr_calls(true); + + /* + * Disable msr filtering, so that the kernel + * handles everything in the next round + */ + GUEST_SYNC(0); + + guest_msr_calls(false); + + GUEST_DONE(); +} + +static void guest_code_permission_bitmap(void) +{ + uint64_t data; + + data = test_rdmsr(MSR_FS_BASE); + GUEST_ASSERT(data == MSR_FS_BASE); + data = test_rdmsr(MSR_GS_BASE); + GUEST_ASSERT(data != MSR_GS_BASE); + + /* Let userspace know to switch the filter */ + GUEST_SYNC(0); + + data = test_rdmsr(MSR_FS_BASE); + GUEST_ASSERT(data != MSR_FS_BASE); + data = test_rdmsr(MSR_GS_BASE); + GUEST_ASSERT(data == MSR_GS_BASE); + + GUEST_DONE(); +} + +static void __guest_gp_handler(struct ex_regs *regs, + char *r_start, char *r_end, + char *w_start, char *w_end) +{ + if (regs->rip == (uintptr_t)r_start) { + regs->rip = (uintptr_t)r_end; + regs->rax = 0; + regs->rdx = 0; + } else if (regs->rip == (uintptr_t)w_start) { + regs->rip = (uintptr_t)w_end; + } else { + GUEST_ASSERT(!"RIP is at an unknown location!"); + } + + ++guest_exception_count; +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + __guest_gp_handler(regs, &rdmsr_start, &rdmsr_end, + &wrmsr_start, &wrmsr_end); +} + +static void guest_fep_gp_handler(struct 
ex_regs *regs) +{ + __guest_gp_handler(regs, &em_rdmsr_start, &em_rdmsr_end, + &em_wrmsr_start, &em_wrmsr_end); +} + +static void check_for_guest_assert(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (vcpu->run->exit_reason == KVM_EXIT_IO && + get_ucall(vcpu, &uc) == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + } +} + +static void process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) +{ + struct kvm_run *run = vcpu->run; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_RDMSR); + TEST_ASSERT(run->msr.index == msr_index, + "Unexpected msr (0x%04x), expected 0x%04x", + run->msr.index, msr_index); + + switch (run->msr.index) { + case MSR_IA32_XSS: + run->msr.data = 0; + break; + case MSR_IA32_FLUSH_CMD: + run->msr.error = 1; + break; + case MSR_NON_EXISTENT: + run->msr.data = msr_non_existent_data; + break; + case MSR_FS_BASE: + run->msr.data = MSR_FS_BASE; + break; + case MSR_GS_BASE: + run->msr.data = MSR_GS_BASE; + break; + default: + TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); + } +} + +static void process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) +{ + struct kvm_run *run = vcpu->run; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_WRMSR); + TEST_ASSERT(run->msr.index == msr_index, + "Unexpected msr (0x%04x), expected 0x%04x", + run->msr.index, msr_index); + + switch (run->msr.index) { + case MSR_IA32_XSS: + if (run->msr.data != 0) + run->msr.error = 1; + break; + case MSR_IA32_FLUSH_CMD: + if (run->msr.data != 1) + run->msr.error = 1; + break; + case MSR_NON_EXISTENT: + msr_non_existent_data = run->msr.data; + break; + default: + TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); + } +} + +static void process_ucall_done(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, + "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", + uc.cmd, UCALL_DONE); +} + +static uint64_t process_ucall(struct kvm_vcpu *vcpu) +{ + struct ucall uc = {}; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + check_for_guest_assert(vcpu); + break; + case UCALL_DONE: + process_ucall_done(vcpu); + break; + default: + TEST_ASSERT(false, "Unexpected ucall"); + } + + return uc.cmd; +} + +static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) +{ + vcpu_run(vcpu); + process_rdmsr(vcpu, msr_index); +} + +static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) +{ + vcpu_run(vcpu); + process_wrmsr(vcpu, msr_index); +} + +static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + return process_ucall(vcpu); +} + +static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + process_ucall_done(vcpu); +} + +KVM_ONE_VCPU_TEST_SUITE(user_msr); + +KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) +{ + struct kvm_vm *vm = vcpu->vm; + uint64_t cmd; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); + + 
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + /* Process guest code userspace exits. */ + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); + + vcpu_run(vcpu); + cmd = process_ucall(vcpu); + + if (is_forced_emulation_enabled) { + TEST_ASSERT_EQ(cmd, UCALL_SYNC); + vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); + + /* Process emulated rdmsr and wrmsr instructions. */ + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); + + /* Confirm the guest completed without issues. */ + run_guest_then_process_ucall_done(vcpu); + } else { + TEST_ASSERT_EQ(cmd, UCALL_DONE); + printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n"); + } +} + +static int handle_ucall(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_SYNC: + vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); + break; + case UCALL_DONE: + return 1; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + return 0; +} + +static void handle_rdmsr(struct kvm_run *run) +{ + run->msr.data = run->msr.index; + msr_reads++; + + if (run->msr.index == MSR_SYSCALL_MASK || + run->msr.index == MSR_GS_BASE) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR read trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "MSR deadbeef read trap w/o inval fault"); + } +} + +static void handle_wrmsr(struct kvm_run *run) +{ + /* ignore */ + msr_writes++; + + if (run->msr.index == MSR_IA32_POWER_CTL) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for MSR_IA32_POWER_CTL incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR_IA32_POWER_CTL trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for deadbeef incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "deadbeef trap w/o inval fault"); + } +} + +KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny) +{ + struct kvm_vm *vm = vcpu->vm; + struct kvm_run *run = vcpu->run; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + prepare_bitmaps(); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny); + + while (1) { + vcpu_run(vcpu); + + switch (run->exit_reason) { + case KVM_EXIT_X86_RDMSR: + 
handle_rdmsr(run); + break; + case KVM_EXIT_X86_WRMSR: + handle_wrmsr(run); + break; + case KVM_EXIT_IO: + if (handle_ucall(vcpu)) + goto done; + break; + } + + } + +done: + TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space"); + TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space"); +} + +KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap) +{ + struct kvm_vm *vm = vcpu->vm; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_fs); + run_guest_then_process_rdmsr(vcpu, MSR_FS_BASE); + TEST_ASSERT(run_guest_then_process_ucall(vcpu) == UCALL_SYNC, + "Expected ucall state to be UCALL_SYNC."); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs); + run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE); + run_guest_then_process_ucall_done(vcpu); +} + +#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \ +({ \ + int r = __vm_ioctl(vm, cmd, arg); \ + \ + if (flag & valid_mask) \ + TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \ + else \ + TEST_ASSERT(r == -1 && errno == EINVAL, \ + "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \ + #cmd, flag, r, errno, strerror(errno)); \ +}) + +static void run_user_space_msr_flag_test(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR }; + int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + + for (i = 0; i < nflags; i++) { + cap.args[0] = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap, + BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK); + } +} + +static void run_msr_filter_flag_test(struct kvm_vm *vm) +{ + u64 deny_bits = 0; + struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = 0, + .bitmap = (uint8_t *)&deny_bits, + }, + }, + }; + int nflags; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + nflags = sizeof(filter.flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK); + } + + filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW; + nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.ranges[0].flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK); + } +} + +/* Test that attempts to write to the unused bits in a flag fails. */ +KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) +{ + struct kvm_vm *vm = vcpu->vm; + + /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */ + run_user_space_msr_flag_test(vm); + + /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. 
*/ + run_msr_filter_flag_test(vm); +} + +int main(int argc, char *argv[]) +{ + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c new file mode 100644 index 000000000000..a81a24761aac --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_apic_access_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * The first subtest simply checks to see that an L2 guest can be + * launched with a valid APIC-access address that is backed by a + * page of L1 physical memory. + * + * The second subtest sets the APIC-access address to a (valid) L1 + * physical address that is not backed by memory. KVM can't handle + * this situation, so resuming L2 should result in a KVM exit for + * internal error (emulation). This is not an architectural + * requirement. It is just a shortcoming of KVM. The internal error + * is unfortunate, but it's better than what used to happen! + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +static void l2_guest_code(void) +{ + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa); + + /* Try to launch L2 with the memory-backed APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(APIC_ACCESS_ADDR, high_gpa); + + /* Try to resume L2 with the unbacked APIC-access address. 
*/ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned long apic_access_addr = ~0ul; + vm_vaddr_t vmx_pages_gva; + unsigned long high_gpa; + struct vmx_pages *vmx; + bool done = false; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + high_gpa = (vm->max_gfn - 1) << vm->page_shift; + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm); + vcpu_args_set(vcpu, 2, vmx_pages_gva, high_gpa); + + while (!done) { + volatile struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + if (apic_access_addr == high_gpa) { + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT(run->internal.suberror == + KVM_INTERNAL_ERROR_EMULATION, + "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u", + run->internal.suberror); + break; + } + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + apic_access_addr = uc.args[1]; + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd); + } + } + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c new file mode 100644 index 000000000000..dad988351493 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_close_while_nested + * + * Copyright (C) 2019, Red Hat, Inc. + * + * Verify that nothing bad happens if a KVM user exits with open + * file descriptors while executing a nested guest. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +enum { + PORT_L0_EXIT = 0x2000, +}; + +static void l2_guest_code(void) +{ + /* Exit to L0 */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (run->io.port == PORT_L0_EXIT) + break; + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c new file mode 100644 index 000000000000..fa512d033205 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +static void l2_guest_code(u64 *a, u64 *b) +{ + READ_ONCE(*a); + WRITE_ONCE(*a, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + +void l1_guest_code(struct vmx_pages *vmx) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static void test_vmx_dirty_log(bool enable_ept) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct vmx_pages *vmx; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool done = false; + + pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * Note that prepare_eptp should be called only after L1's GPA map is done, + * meaning after the last call to virt_map.
+ * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. + */ + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } + + bmap = bitmap_zalloc(TEST_MEM_PAGES); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86/vmx_exception_with_invalid_guest_state.c new file mode 100644 index 000000000000..3fd6eceab46f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_exception_with_invalid_guest_state.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include +#include +#include +#include + +#include "kselftest.h" + +static void guest_ud_handler(struct ex_regs *regs) +{ + /* Loop on the ud2 until guest state is made invalid. */ +} + +static void guest_code(void) +{ + asm volatile("ud2"); +} + +static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Expected emulation failure, got %d", + run->emulation_failure.suberror); +} + +static void run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) +{ + /* + * Always run twice to verify KVM handles the case where _KVM_ queues + * an exception with invalid state and then exits to userspace, i.e. + * that KVM doesn't explode if userspace ignores the initial error. 
+ */ + __run_vcpu_with_invalid_state(vcpu); + __run_vcpu_with_invalid_state(vcpu); +} + +static void set_timer(void) +{ + struct itimerval timer; + + timer.it_value.tv_sec = 0; + timer.it_value.tv_usec = 200; + timer.it_interval = timer.it_value; + TEST_ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); +} + +static void set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set) +{ + static struct kvm_sregs sregs; + + if (!sregs.cr0) + vcpu_sregs_get(vcpu, &sregs); + sregs.tr.unusable = !!set; + vcpu_sregs_set(vcpu, &sregs); +} + +static void set_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + set_or_clear_invalid_guest_state(vcpu, true); +} + +static void clear_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + set_or_clear_invalid_guest_state(vcpu, false); +} + +static struct kvm_vcpu *get_set_sigalrm_vcpu(struct kvm_vcpu *__vcpu) +{ + static struct kvm_vcpu *vcpu = NULL; + + if (__vcpu) + vcpu = __vcpu; + return vcpu; +} + +static void sigalrm_handler(int sig) +{ + struct kvm_vcpu *vcpu = get_set_sigalrm_vcpu(NULL); + struct kvm_vcpu_events events; + + TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); + + vcpu_events_get(vcpu, &events); + + /* + * If an exception is pending, attempt KVM_RUN with invalid guest, + * otherwise rearm the timer and keep doing so until the timer fires + * between KVM queueing an exception and re-entering the guest. + */ + if (events.exception.pending) { + set_invalid_guest_state(vcpu); + run_vcpu_with_invalid_state(vcpu); + } else { + set_timer(); + } +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(host_cpu_is_intel); + TEST_REQUIRE(!vm_is_unrestricted_guest(NULL)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + get_set_sigalrm_vcpu(vcpu); + + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* + * Stuff invalid guest state for L2 by making TR unusuable. The next + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support + * emulating invalid guest state for L2. + */ + set_invalid_guest_state(vcpu); + run_vcpu_with_invalid_state(vcpu); + + /* + * Verify KVM also handles the case where userspace gains control while + * an exception is pending and stuffs invalid state. Run with valid + * guest state and a timer firing every 200us, and attempt to enter the + * guest with invalid state when the handler interrupts KVM with an + * exception pending. + */ + clear_invalid_guest_state(vcpu); + TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, + "Failed to register SIGALRM handler, errno = %d (%s)", + errno, strerror(errno)); + + set_timer(); + run_vcpu_with_invalid_state(vcpu); +} diff --git a/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c new file mode 100644 index 000000000000..a100ee5f0009 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define ARBITRARY_IO_PORT 0x2000 + +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* + * Generate an exit to L0 userspace, i.e. main(), via I/O to an + * arbitrary port. 
+ */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * L2 must be run without unrestricted guest, verify that the selftests + * library hasn't enabled it. Because KVM selftests jump directly to + * 64-bit mode, unrestricted guest support isn't required. + */ + GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) || + !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST)); + + GUEST_ASSERT(!vmlaunch()); + + /* L2 should triple fault after main() stuffs invalid guest state. */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + vcpu_run(vcpu); + + run = vcpu->run; + + /* + * The first exit to L0 userspace should be an I/O access from L2. + * Running L1 should launch L2 without triggering an exit to userspace. + */ + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, + "Expected IN from port %d from L2, got port %d", + ARBITRARY_IO_PORT, run->io.port); + + /* + * Stuff invalid guest state for L2 by making TR unusuable. The next + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support + * emulating invalid guest state for L2. + */ + memset(&sregs, 0, sizeof(sregs)); + vcpu_sregs_get(vcpu, &sregs); + sregs.tr.unusable = 1; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} diff --git a/tools/testing/selftests/kvm/x86/vmx_msrs_test.c b/tools/testing/selftests/kvm/x86/vmx_msrs_test.c new file mode 100644 index 000000000000..90720b6205f4 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_msrs_test.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VMX control MSR test + * + * Copyright (C) 2022 Google LLC. + * + * Tests for KVM ownership of bits in the VMX entry/exit control MSRs. Checks + * that KVM will set owned bits where appropriate, and will not if + * KVM_X86_QUIRK_TWEAK_VMX_CTRL_MSRS is disabled. 
+ */ +#include +#include "kvm_util.h" +#include "vmx.h" + +static void vmx_fixed1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index, + uint64_t mask) +{ + uint64_t val = vcpu_get_msr(vcpu, msr_index); + uint64_t bit; + + mask &= val; + + for_each_set_bit(bit, &mask, 64) { + vcpu_set_msr(vcpu, msr_index, val & ~BIT_ULL(bit)); + vcpu_set_msr(vcpu, msr_index, val); + } +} + +static void vmx_fixed0_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index, + uint64_t mask) +{ + uint64_t val = vcpu_get_msr(vcpu, msr_index); + uint64_t bit; + + mask = ~mask | val; + + for_each_clear_bit(bit, &mask, 64) { + vcpu_set_msr(vcpu, msr_index, val | BIT_ULL(bit)); + vcpu_set_msr(vcpu, msr_index, val); + } +} + +static void vmx_fixed0and1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index) +{ + vmx_fixed0_msr_test(vcpu, msr_index, GENMASK_ULL(31, 0)); + vmx_fixed1_msr_test(vcpu, msr_index, GENMASK_ULL(63, 32)); +} + +static void vmx_save_restore_msrs_test(struct kvm_vcpu *vcpu) +{ + vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, 0); + vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, -1ull); + + vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_BASIC, + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55)); + + vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_MISC, + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | + BIT_ULL(15) | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30)); + + vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2); + vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, -1ull); + vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS); + vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS); + vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS); + vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS); + vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_VMFUNC, -1ull); +} + +static void __ia32_feature_control_msr_test(struct kvm_vcpu *vcpu, + uint64_t msr_bit, + struct kvm_x86_cpu_feature feature) +{ + uint64_t val; + + vcpu_clear_cpuid_feature(vcpu, feature); + + val = vcpu_get_msr(vcpu, MSR_IA32_FEAT_CTL); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val); + + if (!kvm_cpu_has(feature)) + return; + + vcpu_set_cpuid_feature(vcpu, feature); +} + +static void ia32_feature_control_msr_test(struct kvm_vcpu *vcpu) +{ + uint64_t supported_bits = FEAT_CTL_LOCKED | + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | + FEAT_CTL_SGX_LC_ENABLED | + FEAT_CTL_SGX_ENABLED | + FEAT_CTL_LMCE_ENABLED; + int bit, r; + + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_SMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX_LC); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_LMCE_ENABLED, X86_FEATURE_MCE); + + for_each_clear_bit(bit, &supported_bits, 64) { + r = _vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, BIT(bit)); + TEST_ASSERT(r == 0, + "Setting reserved bit %d in IA32_FEATURE_CONTROL should fail", bit); + } +} + +int 
main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + /* No need to actually do KVM_RUN, thus no guest code. */ + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + vmx_save_restore_msrs_test(vcpu); + ia32_feature_control_msr_test(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c new file mode 100644 index 000000000000..1759fa5cb3f2 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. + */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. 
+ */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + /* + * We set L1's scale factor to be a random number from 2 to 10. + * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. 
+ */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86/vmx_pmu_caps_test.c new file mode 100644 index 000000000000..a1f5ff45d518 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_pmu_caps_test.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for VMX-pmu perf capability msr + * + * Copyright (C) 2021 Intel Corporation + * + * Test to check the effect of various CPUID settings on + * MSR_IA32_PERF_CAPABILITIES MSR, and check that what + * we write with KVM_SET_MSR is _not_ modified by the guest + * and check it can be retrieved with KVM_GET_MSR, also test + * the invalid LBR formats are rejected. + */ +#include + +#include + +#include "kvm_test_harness.h" +#include "kvm_util.h" +#include "vmx.h" + +static union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + u64 full_width_write:1; + u64 pebs_baseline:1; + u64 perf_metrics:1; + u64 pebs_output_pt_available:1; + u64 anythread_deprecated:1; + }; + u64 capabilities; +} host_cap; + +/* + * The LBR format and most PEBS features are immutable, all other features are + * fungible (if supported by the host and KVM). 
+ */ +static const union perf_capabilities immutable_caps = { + .lbr_format = -1, + .pebs_trap = 1, + .pebs_arch_reg = 1, + .pebs_format = -1, + .pebs_baseline = 1, +}; + +static const union perf_capabilities format_caps = { + .lbr_format = -1, + .pebs_format = -1, +}; + +static void guest_test_perf_capabilities_gp(uint64_t val) +{ + uint8_t vector = wrmsr_safe(MSR_IA32_PERF_CAPABILITIES, val); + + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for value '0x%lx', got vector '0x%x'", + val, vector); +} + +static void guest_code(uint64_t current_val) +{ + int i; + + guest_test_perf_capabilities_gp(current_val); + guest_test_perf_capabilities_gp(0); + + for (i = 0; i < 64; i++) + guest_test_perf_capabilities_gp(current_val ^ BIT_ULL(i)); + + GUEST_DONE(); +} + +KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps); + +/* + * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value + * written, that the guest always sees the userspace controlled value, and that + * PERF_CAPABILITIES is immutable after KVM_RUN. + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) +{ + struct ucall uc; + int r, i; + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + + vcpu_args_set(vcpu, 1, host_cap.capabilities); + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + + TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), + host_cap.capabilities); + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + TEST_ASSERT(!r, "Post-KVM_RUN write '0' didn't fail"); + + for (i = 0; i < 64; i++) { + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities ^ BIT_ULL(i)); + TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx'didn't fail", + host_cap.capabilities ^ BIT_ULL(i)); + } +} + +/* + * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features + * enabled, as well as '0' (to disable all features). + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code) +{ + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); +} + +KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code) +{ + const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities; + int bit; + + for_each_set_bit(bit, &fungible_caps, 64) { + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(bit)); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities & ~BIT_ULL(bit)); + } + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); +} + +/* + * Verify KVM rejects attempts to set unsupported and/or immutable features in + * PERF_CAPABILITIES. Note, LBR format and PEBS format need to be validated + * separately as they are multi-bit values, e.g. toggling or setting a single + * bit can generate a false positive without dedicated safeguards. 
+ */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code) +{ + const uint64_t reserved_caps = (~host_cap.capabilities | + immutable_caps.capabilities) & + ~format_caps.capabilities; + union perf_capabilities val = host_cap; + int r, bit; + + for_each_set_bit(bit, &reserved_caps, 64) { + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities ^ BIT_ULL(bit)); + TEST_ASSERT(!r, "%s immutable feature 0x%llx (bit %d) didn't fail", + host_cap.capabilities & BIT_ULL(bit) ? "Setting" : "Clearing", + BIT_ULL(bit), bit); + } + + /* + * KVM only supports the host's native LBR format, as well as '0' (to + * disable LBR support). Verify KVM rejects all other LBR formats. + */ + for (val.lbr_format = 1; val.lbr_format; val.lbr_format++) { + if (val.lbr_format == host_cap.lbr_format) + continue; + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); + TEST_ASSERT(!r, "Bad LBR FMT = 0x%x didn't fail, host = 0x%x", + val.lbr_format, host_cap.lbr_format); + } + + /* Ditto for the PEBS format. */ + for (val.pebs_format = 1; val.pebs_format; val.pebs_format++) { + if (val.pebs_format == host_cap.pebs_format) + continue; + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); + TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x", + val.pebs_format, host_cap.pebs_format); + } +} + +/* + * Test that LBR MSRs are writable when LBRs are enabled, and then verify that + * disabling the vPMU via CPUID also disables LBR support. Set bits 2:0 of + * LBR_TOS as those bits are writable across all uarch implementations (arch + * LBRs will need to poke a different MSR). + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code) +{ + int r; + + if (!host_cap.lbr_format) + return; + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); + + vcpu_clear_cpuid_entry(vcpu, X86_PROPERTY_PMU_VERSION.function); + + r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); + TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU"); +} + +KVM_ONE_VCPU_TEST(vmx_pmu_caps, perf_capabilities_unsupported, guest_code) +{ + uint64_t val; + int i, r; + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES); + TEST_ASSERT_EQ(val, host_cap.capabilities); + + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_PDCM); + + val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES); + TEST_ASSERT_EQ(val, 0); + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + + for (i = 0; i < 64; i++) { + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(i)); + TEST_ASSERT(!r, "Setting PERF_CAPABILITIES bit %d (= 0x%llx) should fail without PDCM", + i, BIT_ULL(i)); + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_is_pmu_enabled()); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); + + TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); + + host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES); + + TEST_ASSERT(host_cap.full_width_write, + "Full-width writes should always be supported"); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c new file mode 100644 index 000000000000..00dd2ac07a61 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: 
GPL-2.0-only +/* + * VMX-preemption timer test + * + * Copyright (C) 2020, Google, LLC. + * + * Test to ensure the VM-Enter after migration doesn't + * incorrectly restart the timer with the full timer + * value instead of the partially decayed timer value + * + */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define PREEMPTION_TIMER_VALUE 100000000ull +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull + +u32 vmx_pt_rate; +bool l2_save_restore_done; +static u64 l2_vmx_pt_start; +volatile u64 l2_vmx_pt_finish; + +union vmx_basic basic; +union vmx_ctrl_msr ctrl_pin_rev; +union vmx_ctrl_msr ctrl_exit_rev; + +void l2_guest_code(void) +{ + u64 vmx_pt_delta; + + vmcall(); + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + /* + * Wait until the 1st threshold has passed + */ + do { + l2_vmx_pt_finish = rdtsc(); + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >> + vmx_pt_rate; + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1); + + /* + * Force L2 through Save and Restore cycle + */ + GUEST_SYNC(1); + + l2_save_restore_done = 1; + + /* + * Now wait for the preemption timer to fire and + * exit to L1 + */ + while ((l2_vmx_pt_finish = rdtsc())) + ; +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 l1_vmx_pt_start; + u64 l1_vmx_pt_finish; + u64 l1_tsc_deadline, l2_tsc_deadline; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Check for Preemption timer support + */ + basic.val = rdmsr(MSR_IA32_VMX_BASIC); + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS + : MSR_IA32_VMX_PINBASED_CTLS); + ctrl_exit_rev.val = rdmsr(basic.ctrl ?
MSR_IA32_VMX_TRUE_EXIT_CTLS + : MSR_IA32_VMX_EXIT_CTLS); + + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + return; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + + /* + * Turn on PIN control and resume the guest + */ + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL, + vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_VMX_PREEMPTION_TIMER)); + + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE, + PREEMPTION_TIMER_VALUE)); + + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; + + l2_save_restore_done = 0; + + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + GUEST_ASSERT(!vmresume()); + + l1_vmx_pt_finish = rdtsc(); + + /* + * Ensure exit from L2 happens after L2 goes through + * save and restore + */ + GUEST_ASSERT(l2_save_restore_done); + + /* + * Ensure the exit from L2 is due to preemption timer expiry + */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER); + + l1_tsc_deadline = l1_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + l2_tsc_deadline = l2_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + /* + * Sync with the host and pass the l1|l2 pt_expiry_finish times and + * tsc deadlines so that host can verify they are as expected + */ + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline, + l2_vmx_pt_finish, l2_tsc_deadline); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* + * AMD currently does not implement any VMX features, so for now we + * just early out. + */ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vcpu_regs_get(vcpu, ®s1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + /* + * If this stage 2 then we should verify the vmx pt expiry + * is as expected. + * From L1's perspective verify Preemption timer hasn't + * expired too early. + * From L2's perspective verify Preemption timer hasn't + * expired too late. 
+ */ + if (stage == 2) { + + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n", + stage, uc.args[2], uc.args[3]); + + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n", + stage, uc.args[4], uc.args[5]); + + TEST_ASSERT(uc.args[2] >= uc.args[3], + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)", + stage, uc.args[2], uc.args[3]); + + TEST_ASSERT(uc.args[4] < uc.args[5], + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)", + stage, uc.args[4], uc.args[5]); + } + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c new file mode 100644 index 000000000000..67a62a5a8895 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_set_nested_state_test + * + * Copyright (C) 2019, Google LLC. + * + * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include +#include +#include +#include + +/* + * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value + * changes this should be updated. + */ +#define VMCS12_REVISION 0x11e57ed0 + +bool have_evmcs; + +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) +{ + vcpu_nested_state_set(vcpu, state); +} + +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state, + int expected_errno) +{ + int rv; + + rv = __vcpu_nested_state_set(vcpu, state); + TEST_ASSERT(rv == -1 && errno == expected_errno, + "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", + strerror(expected_errno), expected_errno, rv, strerror(errno), + errno); +} + +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EINVAL); +} + +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EFAULT); +} + +void set_revision_id_for_vmcs12(struct kvm_nested_state *state, + u32 vmcs12_revision) +{ + /* Set revision_id in vmcs12 to vmcs12_revision. 
*/ + memcpy(&state->data, &vmcs12_revision, sizeof(u32)); +} + +void set_default_state(struct kvm_nested_state *state) +{ + memset(state, 0, sizeof(*state)); + state->flags = KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GUEST_MODE; + state->format = 0; + state->size = sizeof(*state); +} + +void set_default_vmx_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + if (have_evmcs) + state->flags = KVM_STATE_NESTED_EVMCS; + state->format = 0; + state->size = size; + state->hdr.vmx.vmxon_pa = 0x1000; + state->hdr.vmx.vmcs12_pa = 0x2000; + state->hdr.vmx.smm.flags = 0; + set_revision_id_for_vmcs12(state, VMCS12_REVISION); +} + +void test_vmx_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCS12. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + /* The format must be set to 0. 0 for VMX, 1 for SVM. */ + set_default_vmx_state(state, state_sz); + state->format = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. + */ + set_default_vmx_state(state, state_sz); + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa + * is set to -1ull, but the flags must be zero. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + test_nested_state_expect_einval(vcpu, state); + + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + /* Enable VMX in the guest CPUID. */ + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); + + /* + * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without + * setting the nested state. When the eVMCS flag is not set, the + * expected return value is '0'. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + test_nested_state(vcpu, state); + + /* + * When eVMCS is supported, the eVMCS flag can only be set if the + * enlightened VMCS capability has been enabled. + */ + if (have_evmcs) { + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu); + test_nested_state(vcpu, state); + } + + /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ + state->hdr.vmx.smm.flags = 1; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa set to a non-page aligned address. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and + * KVM_STATE_NESTED_GUEST_MODE set together. 
+ */ + set_default_vmx_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have any of the SMM flags set besides: + * KVM_STATE_NESTED_SMM_GUEST_MODE + * KVM_STATE_NESTED_SMM_VMXON + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | + KVM_STATE_NESTED_SMM_VMXON); + test_nested_state_expect_einval(vcpu, state); + + /* Outside SMM, SMM flags must be zero. */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + test_nested_state(vcpu, state); + + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + test_nested_state(vcpu, state); + + /* Invalid flags are rejected, even if no VMCS loaded. */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* vmxon_pa cannot be the same address as vmcs_pa. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get + * it again. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); + TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + free(state); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_nested_state state; + struct kvm_vcpu *vcpu; + + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + /* + * AMD currently does not implement set_nested_state, so for now we + * just early out. + */ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + /* + * First run tests with VMX disabled to check error handling. + */ + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + + /* Passing a NULL kvm_nested_state causes a EFAULT. */ + test_nested_state_expect_efault(vcpu, NULL); + + /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ + set_default_state(&state); + state.size = 0; + test_nested_state_expect_einval(vcpu, &state); + + /* + * Setting the flags 0xf fails the flags check. 
The only flags that + * can be used are: + * KVM_STATE_NESTED_GUEST_MODE + * KVM_STATE_NESTED_RUN_PENDING + * KVM_STATE_NESTED_EVMCS + */ + set_default_state(&state); + state.flags = 0xf; + test_nested_state_expect_einval(vcpu, &state); + + /* + * If KVM_STATE_NESTED_RUN_PENDING is set then + * KVM_STATE_NESTED_GUEST_MODE has to be set as well. + */ + set_default_state(&state); + state.flags = KVM_STATE_NESTED_RUN_PENDING; + test_nested_state_expect_einval(vcpu, &state); + + test_vmx_nested_state(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c new file mode 100644 index 000000000000..2ceb5c78c442 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_tsc_adjust_test + * + * Copyright (C) 2018, Google LLC. + * + * IA32_TSC_ADJUST test + * + * According to the SDM, "if an execution of WRMSR to the + * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC, + * the logical processor also adds (or subtracts) value X from the + * IA32_TSC_ADJUST MSR. + * + * Note that when L1 doesn't intercept writes to IA32_TSC, a + * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC + * value. + * + * This test verifies that this unusual case is handled correctly. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#ifndef MSR_IA32_TSC_ADJUST +#define MSR_IA32_TSC_ADJUST 0x3b +#endif + +#define TSC_ADJUST_VALUE (1ll << 32) +#define TSC_OFFSET_VALUE -(1ll << 48) + +enum { + PORT_ABORT = 0x1000, + PORT_REPORT, + PORT_DONE, +}; + +enum { + VMXON_PAGE = 0, + VMCS_PAGE, + MSR_BITMAP_PAGE, + + NUM_VMX_PAGES, +}; + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void check_ia32_tsc_adjust(int64_t max) +{ + int64_t adjust; + + adjust = rdmsr(MSR_IA32_TSC_ADJUST); + GUEST_SYNC(adjust); + GUEST_ASSERT(adjust <= max); +} + +static void l2_guest_code(void) +{ + uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE; + + wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE); + check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); + + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + uintptr_t save_cr3; + + GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); + wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); + check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + /* Jump into L2. First, test failure to load guest CR3. 
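+	 * Stuffing an invalid (-1ull) GUEST_CR3 makes vmlaunch() fail VM-entry
+	 * with EXIT_REASON_INVALID_STATE, after which IA32_TSC_ADJUST is
+	 * re-checked and the original CR3 is restored.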
*/ + save_cr3 = vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); + vmwrite(GUEST_CR3, save_cr3); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); + + GUEST_DONE(); +} + +static void report(int64_t val) +{ + pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", + val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + report(uc.args[1]); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c new file mode 100644 index 000000000000..a76078a08ff8 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * xapic_ipi_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake + * another vCPU that is halted when KVM's backing page for the APIC access + * address has been moved by mm. + * + * The test starts two vCPUs: one that sends IPIs and one that continually + * executes HLT. The sender checks that the halter has woken from the HLT and + * has reentered HLT before sending the next IPI. While the vCPUs are running, + * the host continually calls migrate_pages to move all of the process' pages + * amongst the available numa nodes on the machine. + * + * Migration is a command line option. When used on non-numa machines will + * exit with error. Test is still usefull on non-numa for testing IPIs. + */ +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "numaif.h" +#include "processor.h" +#include "test_util.h" +#include "vmx.h" + +/* Default running time for the test */ +#define DEFAULT_RUN_SECS 3 + +/* Default delay between migrate_pages calls (microseconds) */ +#define DEFAULT_DELAY_USECS 500000 + +/* + * Vector for IPI from sender vCPU to halting vCPU. + * Value is arbitrary and was chosen for the alternating bit pattern. Any + * value should work. + */ +#define IPI_VECTOR 0xa5 + +/* + * Incremented in the IPI handler. 
Provides evidence to the sender that the IPI + * arrived at the destination + */ +static volatile uint64_t ipis_rcvd; + +/* Data struct shared between host main thread and vCPUs */ +struct test_data_page { + uint32_t halter_apic_id; + volatile uint64_t hlt_count; + volatile uint64_t wake_count; + uint64_t ipis_sent; + uint64_t migrations_attempted; + uint64_t migrations_completed; + uint32_t icr; + uint32_t icr2; + uint32_t halter_tpr; + uint32_t halter_ppr; + + /* + * Record local version register as a cross-check that APIC access + * worked. Value should match what KVM reports (APIC_VERSION in + * arch/x86/kvm/lapic.c). If test is failing, check that values match + * to determine whether APIC access exits are working. + */ + uint32_t halter_lvr; +}; + +struct thread_params { + struct test_data_page *data; + struct kvm_vcpu *vcpu; + uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ +}; + +void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void halter_guest_code(struct test_data_page *data) +{ + verify_apic_base_addr(); + xapic_enable(); + + data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + data->halter_lvr = xapic_read_reg(APIC_LVR); + + /* + * Loop forever HLTing and recording halts & wakes. Disable interrupts + * each time around to minimize window between signaling the pending + * halt to the sender vCPU and executing the halt. No need to disable on + * first run as this vCPU executes first and the host waits for it to + * signal going into first halt before starting the sender vCPU. Record + * TPR and PPR for diagnostic purposes in case the test fails. + */ + for (;;) { + data->halter_tpr = xapic_read_reg(APIC_TASKPRI); + data->halter_ppr = xapic_read_reg(APIC_PROCPRI); + data->hlt_count++; + asm volatile("sti; hlt; cli"); + data->wake_count++; + } +} + +/* + * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to + * enable diagnosing errant writes to the APIC access address backing page in + * case of test failure. + */ +static void guest_ipi_handler(struct ex_regs *regs) +{ + ipis_rcvd++; + xapic_write_reg(APIC_EOI, 77); +} + +static void sender_guest_code(struct test_data_page *data) +{ + uint64_t last_wake_count; + uint64_t last_hlt_count; + uint64_t last_ipis_rcvd_count; + uint32_t icr_val; + uint32_t icr2_val; + uint64_t tsc_start; + + verify_apic_base_addr(); + xapic_enable(); + + /* + * Init interrupt command register for sending IPIs + * + * Delivery mode=fixed, per SDM: + * "Delivers the interrupt specified in the vector field to the target + * processor." + * + * Destination mode=physical i.e. specify target by its local APIC + * ID. This vCPU assumes that the halter vCPU has already started and + * set data->halter_apic_id. + */ + icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR); + icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id); + data->icr = icr_val; + data->icr2 = icr2_val; + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + for (;;) { + /* + * Send IPI to halter vCPU. + * First IPI can be sent unconditionally because halter vCPU + * starts earlier. + */ + xapic_write_reg(APIC_ICR2, icr2_val); + xapic_write_reg(APIC_ICR, icr_val); + data->ipis_sent++; + + /* + * Wait up to ~1 sec for halter to indicate that it has: + * 1. Received the IPI + * 2. Woken up from the halt + * 3. 
Gone back into halt + * Current CPUs typically run at 2.x Ghz which is ~2 + * billion ticks per second. + */ + tsc_start = rdtsc(); + while (rdtsc() - tsc_start < 2000000000) { + if ((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)) + break; + } + + GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)); + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + } +} + +static void *vcpu_thread(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct kvm_vcpu *vcpu = params->vcpu; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (get_ucall(vcpu, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, + "vCPU %u exited with error: %s.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu", + vcpu->id, (const char *)uc.args[0], + params->data->ipis_sent, params->data->hlt_count, + params->data->wake_count, + *params->pipis_rcvd, params->data->halter_tpr, + params->data->halter_ppr, params->data->halter_lvr, + params->data->migrations_attempted, + params->data->migrations_completed); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(r == 0, + "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(r == 0, + "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, + uint64_t *pipis_rcvd) +{ + long pages_not_moved; + unsigned long nodemask = 0; + unsigned long nodemasks[sizeof(nodemask) * 8]; + int nodes = 0; + time_t start_time, last_update, now; + time_t interval_secs = 1; + int i, r; + int from, to; + unsigned long bit; + uint64_t hlt_count; + uint64_t wake_count; + uint64_t ipis_sent; + + fprintf(stderr, "Calling migrate_pages every %d microseconds\n", + delay_usecs); + + /* Get set of first 64 numa nodes available */ + r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + 0, MPOL_F_MEMS_ALLOWED); + TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); + + fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " + "(each 1-bit indicates node is present): %#lx\n", + sizeof(nodemask) * 8, nodemask); + + /* Init array of masks containing a single-bit in each, one for each + * available node. migrate_pages called below requires specifying nodes + * as bit masks. + */ + for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) { + if (nodemask & bit) { + nodemasks[nodes] = nodemask & bit; + nodes++; + } + } + + TEST_ASSERT(nodes > 1, + "Did not find at least 2 numa nodes. 
Can't do migration"); + + fprintf(stderr, "Migrating amongst %d nodes found\n", nodes); + + from = 0; + to = 1; + start_time = time(NULL); + last_update = start_time; + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + + while ((int)(time(NULL) - start_time) < run_secs) { + data->migrations_attempted++; + + /* + * migrate_pages with PID=0 will migrate all pages of this + * process between the nodes specified as bitmasks. The page + * backing the APIC access address belongs to this process + * because it is allocated by KVM in the context of the + * KVM_CREATE_VCPU ioctl. If that assumption ever changes this + * test may break or give a false positive signal. + */ + pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]), + &nodemasks[from], + &nodemasks[to]); + if (pages_not_moved < 0) + fprintf(stderr, + "migrate_pages failed, errno=%d\n", errno); + else if (pages_not_moved > 0) + fprintf(stderr, + "migrate_pages could not move %ld pages\n", + pages_not_moved); + else + data->migrations_completed++; + + from = to; + to++; + if (to == nodes) + to = 0; + + now = time(NULL); + if (((now - start_time) % interval_secs == 0) && + (now != last_update)) { + last_update = now; + fprintf(stderr, + "%lu seconds: Migrations attempted=%lu completed=%lu, " + "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n", + now - start_time, data->migrations_attempted, + data->migrations_completed, + data->ipis_sent, *pipis_rcvd, + data->hlt_count, data->wake_count); + + TEST_ASSERT(ipis_sent != data->ipis_sent && + hlt_count != data->hlt_count && + wake_count != data->wake_count, + "IPI, HLT and wake count have not increased " + "in the last %lu seconds. " + "HLTer is likely hung.", interval_secs); + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + } + usleep(delay_usecs); + } +} + +void get_cmdline_args(int argc, char *argv[], int *run_secs, + bool *migrate, int *delay_usecs) +{ + for (;;) { + int opt = getopt(argc, argv, "s:d:m"); + + if (opt == -1) + break; + switch (opt) { + case 's': + *run_secs = parse_size(optarg); + break; + case 'm': + *migrate = true; + break; + case 'd': + *delay_usecs = parse_size(optarg); + break; + default: + TEST_ASSERT(false, + "Usage: -s . Default is %d seconds.\n" + "-m adds calls to migrate_pages while vCPUs are running." + " Default is no migrations.\n" + "-d - delay between migrate_pages() calls." 
+ " Default is %d microseconds.", + DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS); + } + } +} + +int main(int argc, char *argv[]) +{ + int r; + int wait_secs; + const int max_halter_wait = 10; + int run_secs = 0; + int delay_usecs = 0; + struct test_data_page *data; + vm_vaddr_t test_data_page_vaddr; + bool migrate = false; + pthread_t threads[2]; + struct thread_params params[2]; + struct kvm_vm *vm; + uint64_t *pipis_rcvd; + + get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs); + if (run_secs <= 0) + run_secs = DEFAULT_RUN_SECS; + if (delay_usecs <= 0) + delay_usecs = DEFAULT_DELAY_USECS; + + vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); + + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code); + + test_data_page_vaddr = vm_vaddr_alloc_page(vm); + data = addr_gva2hva(vm, test_data_page_vaddr); + memset(data, 0, sizeof(*data)); + params[0].data = data; + params[1].data = data; + + vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr); + vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr); + + pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); + params[0].pipis_rcvd = pipis_rcvd; + params[1].pipis_rcvd = pipis_rcvd; + + /* Start halter vCPU thread and wait for it to execute first HLT. */ + r = pthread_create(&threads[0], NULL, vcpu_thread, ¶ms[0]); + TEST_ASSERT(r == 0, + "pthread_create halter failed errno=%d", errno); + fprintf(stderr, "Halter vCPU thread started\n"); + + wait_secs = 0; + while ((wait_secs < max_halter_wait) && !data->hlt_count) { + sleep(1); + wait_secs++; + } + + TEST_ASSERT(data->hlt_count, + "Halter vCPU did not execute first HLT within %d seconds", + max_halter_wait); + + fprintf(stderr, + "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n", + data->halter_apic_id, wait_secs); + + r = pthread_create(&threads[1], NULL, vcpu_thread, ¶ms[1]); + TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno); + + fprintf(stderr, + "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n", + run_secs); + + if (!migrate) + sleep(run_secs); + else + do_migrations(data, run_secs, delay_usecs, pipis_rcvd); + + /* + * Cancel threads and wait for them to stop. 
+ */ + cancel_join_vcpu_thread(threads[0], params[0].vcpu); + cancel_join_vcpu_thread(threads[1], params[1].vcpu); + + fprintf(stderr, + "Test successful after running for %d seconds.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter APIC ID=%#x\n" + "Sender ICR value=%#x ICR2 value=%#x\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu\n", + run_secs, data->ipis_sent, + data->hlt_count, data->wake_count, *pipis_rcvd, + data->halter_apic_id, + data->icr, data->icr2, + data->halter_tpr, data->halter_ppr, data->halter_lvr, + data->migrations_attempted, data->migrations_completed); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xapic_state_test.c b/tools/testing/selftests/kvm/x86/xapic_state_test.c new file mode 100644 index 000000000000..88bcca188799 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xapic_state_test.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include + +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +struct xapic_vcpu { + struct kvm_vcpu *vcpu; + bool is_x2apic; + bool has_xavic_errata; +}; + +static void xapic_guest_code(void) +{ + asm volatile("cli"); + + xapic_enable(); + + while (1) { + uint64_t val = (u64)xapic_read_reg(APIC_IRR) | + (u64)xapic_read_reg(APIC_IRR + 0x10) << 32; + + xapic_write_reg(APIC_ICR2, val >> 32); + xapic_write_reg(APIC_ICR, val); + GUEST_SYNC(val); + } +} + +#define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ + GENMASK_ULL(17, 16) | \ + GENMASK_ULL(13, 13)) + +static void x2apic_guest_code(void) +{ + asm volatile("cli"); + + x2apic_enable(); + + do { + uint64_t val = x2apic_read_reg(APIC_IRR) | + x2apic_read_reg(APIC_IRR + 0x10) << 32; + + if (val & X2APIC_RSVD_BITS_MASK) { + x2apic_write_reg_fault(APIC_ICR, val); + } else { + x2apic_write_reg(APIC_ICR, val); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); + } + GUEST_SYNC(val); + } while (1); +} + +static void ____test_icr(struct xapic_vcpu *x, uint64_t val) +{ + struct kvm_vcpu *vcpu = x->vcpu; + struct kvm_lapic_state xapic; + struct ucall uc; + uint64_t icr; + + /* + * Tell the guest what ICR value to write. Use the IRR to pass info, + * all bits are valid and should not be modified by KVM (ignoring the + * fact that vectors 0-15 are technically illegal). + */ + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + *((u32 *)&xapic.regs[APIC_IRR]) = val; + *((u32 *)&xapic.regs[APIC_IRR + 0x10]) = val >> 32; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); + + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); + TEST_ASSERT_EQ(uc.args[1], val); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | + (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; + if (!x->is_x2apic) { + if (!x->has_xavic_errata) + val &= (-1u | (0xffull << (32 + 24))); + } else if (val & X2APIC_RSVD_BITS_MASK) { + return; + } + + if (x->has_xavic_errata) + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); + else + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); +} + +static void __test_icr(struct xapic_vcpu *x, uint64_t val) +{ + /* + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats + * it is as _must_ be zero. Intel simply ignores the bit. Don't test + * the BUSY bit for x2APIC, as there is no single correct behavior. 
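+	 * Hence the value below is tried with APIC_ICR_BUSY set only for
+	 * xAPIC, and then with the bit cleared for both APIC modes.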
+ */ + if (!x->is_x2apic) + ____test_icr(x, val | APIC_ICR_BUSY); + + ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); +} + +static void test_icr(struct xapic_vcpu *x) +{ + struct kvm_vcpu *vcpu = x->vcpu; + uint64_t icr, i, j; + + icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED; + for (i = 0; i <= 0xff; i++) + __test_icr(x, icr | i); + + icr = APIC_INT_ASSERT | APIC_DM_FIXED; + for (i = 0; i <= 0xff; i++) + __test_icr(x, icr | i); + + /* + * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of + * vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff. + */ + icr = APIC_INT_ASSERT | 0xff; + for (i = 0; i < 0xff; i++) { + if (i == vcpu->id) + continue; + for (j = 0; j < 8; j++) + __test_icr(x, i << (32 + 24) | icr | (j << 8)); + } + + /* And again with a shorthand destination for all types of IPIs. */ + icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT; + for (i = 0; i < 8; i++) + __test_icr(x, icr | (i << 8)); + + /* And a few garbage value, just make sure it's an IRQ (blocked). */ + __test_icr(x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); + __test_icr(x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); + __test_icr(x, -1ull & ~APIC_DM_FIXED_MASK); +} + +static void __test_apic_id(struct kvm_vcpu *vcpu, uint64_t apic_base) +{ + uint32_t apic_id, expected; + struct kvm_lapic_state xapic; + + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, apic_base); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + expected = apic_base & X2APIC_ENABLE ? vcpu->id : vcpu->id << 24; + apic_id = *((u32 *)&xapic.regs[APIC_ID]); + + TEST_ASSERT(apic_id == expected, + "APIC_ID not set back to %s format; wanted = %x, got = %x", + (apic_base & X2APIC_ENABLE) ? "x2APIC" : "xAPIC", + expected, apic_id); +} + +/* + * Verify that KVM switches the APIC_ID between xAPIC and x2APIC when userspace + * stuffs MSR_IA32_APICBASE. Setting the APIC_ID when x2APIC is enabled and + * when the APIC transitions for DISABLED to ENABLED is architectural behavior + * (on Intel), whereas the x2APIC => xAPIC transition behavior is KVM ABI since + * attempted to transition from x2APIC to xAPIC without disabling the APIC is + * architecturally disallowed. + */ +static void test_apic_id(void) +{ + const uint32_t NR_VCPUS = 3; + struct kvm_vcpu *vcpus[NR_VCPUS]; + uint64_t apic_base; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_vcpus(NR_VCPUS, NULL, vcpus); + vm_enable_cap(vm, KVM_CAP_X2APIC_API, KVM_X2APIC_API_USE_32BIT_IDS); + + for (i = 0; i < NR_VCPUS; i++) { + apic_base = vcpu_get_msr(vcpus[i], MSR_IA32_APICBASE); + + TEST_ASSERT(apic_base & MSR_IA32_APICBASE_ENABLE, + "APIC not in ENABLED state at vCPU RESET"); + TEST_ASSERT(!(apic_base & X2APIC_ENABLE), + "APIC not in xAPIC mode at vCPU RESET"); + + __test_apic_id(vcpus[i], apic_base); + __test_apic_id(vcpus[i], apic_base | X2APIC_ENABLE); + __test_apic_id(vcpus[i], apic_base); + } + + kvm_vm_free(vm); +} + +static void test_x2apic_id(void) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); + + /* + * Try stuffing a modified x2APIC ID, KVM should ignore the value and + * always return the vCPU's default/readonly x2APIC ID. 
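+	 * Each iteration below stuffs a bogus ID (i << 24) via KVM_SET_LAPIC
+	 * and asserts that the ID read back is still vcpu->id << 24.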
+ */ + for (i = 0; i <= 0xff; i++) { + *(u32 *)(lapic.regs + APIC_ID) = i << 24; + *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic); + TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24, + "x2APIC ID should be fully readonly"); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct xapic_vcpu x = { + .vcpu = NULL, + .is_x2apic = true, + }; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); + test_icr(&x); + kvm_vm_free(vm); + + /* + * Use a second VM for the xAPIC test so that x2APIC can be hidden from + * the guest in order to test AVIC. KVM disallows changing CPUID after + * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. + */ + vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); + x.is_x2apic = false; + + /* + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel + * drops writes, AMD does not). Account for the errata when checking + * that KVM reads back what was written. + */ + x.has_xavic_errata = host_cpu_is_amd && + get_kvm_amd_param_bool("avic"); + + vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + test_icr(&x); + kvm_vm_free(vm); + + test_apic_id(); + test_x2apic_id(); +} diff --git a/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c new file mode 100644 index 000000000000..c8a5c5e51661 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XCR0 cpuid test + * + * Copyright (C) 2022, Google LLC. + */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +/* + * Assert that architectural dependency rules are satisfied, e.g. that AVX is + * supported if and only if SSE is supported. + */ +#define ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, xfeatures, dependencies) \ +do { \ + uint64_t __supported = (supported_xcr0) & ((xfeatures) | (dependencies)); \ + \ + __GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \ + __supported == ((xfeatures) | (dependencies)), \ + "supported = 0x%lx, xfeatures = 0x%llx, dependencies = 0x%llx", \ + __supported, (xfeatures), (dependencies)); \ +} while (0) + +/* + * Assert that KVM reports a sane, usable as-is XCR0. Architecturally, a CPU + * isn't strictly required to _support_ all XFeatures related to a feature, but + * at the same time XSETBV will #GP if bundled XFeatures aren't enabled and + * disabled coherently. E.g. a CPU can technically enumerate supported for + * XTILE_CFG but not XTILE_DATA, but attempting to enable XTILE_CFG without + * XTILE_DATA will #GP. 
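+ * E.g. ASSERT_ALL_OR_NONE_XFEATURE(xcr0, XFEATURE_MASK_XTILE) passes only if
+ * XCR0 advertises both XTILE_CFG and XTILE_DATA, or neither of them.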
+ */ +#define ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, xfeatures) \ +do { \ + uint64_t __supported = (supported_xcr0) & (xfeatures); \ + \ + __GUEST_ASSERT(!__supported || __supported == (xfeatures), \ + "supported = 0x%lx, xfeatures = 0x%llx", \ + __supported, (xfeatures)); \ +} while (0) + +static void guest_code(void) +{ + uint64_t initial_xcr0; + uint64_t supported_xcr0; + int i, vector; + + set_cr4(get_cr4() | X86_CR4_OSXSAVE); + + initial_xcr0 = xgetbv(0); + supported_xcr0 = this_cpu_supported_xcr0(); + + GUEST_ASSERT(initial_xcr0 == supported_xcr0); + + /* Check AVX */ + ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, + XFEATURE_MASK_YMM, + XFEATURE_MASK_SSE); + + /* Check MPX */ + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + /* Check AVX-512 */ + ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, + XFEATURE_MASK_AVX512, + XFEATURE_MASK_SSE | XFEATURE_MASK_YMM); + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_AVX512); + + /* Check AMX */ + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_XTILE); + + vector = xsetbv_safe(0, XFEATURE_MASK_FP); + __GUEST_ASSERT(!vector, + "Expected success on XSETBV(FP), got vector '0x%x'", + vector); + + vector = xsetbv_safe(0, supported_xcr0); + __GUEST_ASSERT(!vector, + "Expected success on XSETBV(0x%lx), got vector '0x%x'", + supported_xcr0, vector); + + for (i = 0; i < 64; i++) { + if (supported_xcr0 & BIT_ULL(i)) + continue; + + vector = xsetbv_safe(0, supported_xcr0 | BIT_ULL(i)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP on XSETBV(0x%llx), supported XCR0 = %lx, got vector '0x%x'", + BIT_ULL(i), supported_xcr0, vector); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + while (1) { + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c new file mode 100644 index 000000000000..a59b3c799bb2 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2021 Amazon.com, Inc. or its affiliates. 
+ */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include +#include +#include +#include +#include + +#include + +#define SHINFO_REGION_GVA 0xc0000000ULL +#define SHINFO_REGION_GPA 0xc0000000ULL +#define SHINFO_REGION_SLOT 10 + +#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (3 * PAGE_SIZE)) +#define DUMMY_REGION_SLOT 11 + +#define DUMMY_REGION_GPA_2 (SHINFO_REGION_GPA + (4 * PAGE_SIZE)) +#define DUMMY_REGION_SLOT_2 12 + +#define SHINFO_ADDR (SHINFO_REGION_GPA) +#define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) +#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) +#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15) + +#define SHINFO_VADDR (SHINFO_REGION_GVA) +#define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) +#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15) + +#define EVTCHN_VECTOR 0x10 + +#define EVTCHN_TEST1 15 +#define EVTCHN_TEST2 66 +#define EVTCHN_TIMER 13 + +enum { + TEST_INJECT_VECTOR = 0, + TEST_RUNSTATE_runnable, + TEST_RUNSTATE_blocked, + TEST_RUNSTATE_offline, + TEST_RUNSTATE_ADJUST, + TEST_RUNSTATE_DATA, + TEST_STEAL_TIME, + TEST_EVTCHN_MASKED, + TEST_EVTCHN_UNMASKED, + TEST_EVTCHN_SLOWPATH, + TEST_EVTCHN_SEND_IOCTL, + TEST_EVTCHN_HCALL, + TEST_EVTCHN_HCALL_SLOWPATH, + TEST_EVTCHN_HCALL_EVENTFD, + TEST_TIMER_SETUP, + TEST_TIMER_WAIT, + TEST_TIMER_RESTORE, + TEST_POLL_READY, + TEST_POLL_TIMEOUT, + TEST_POLL_MASKED, + TEST_POLL_WAKE, + SET_VCPU_INFO, + TEST_TIMER_PAST, + TEST_LOCKING_SEND_RACE, + TEST_LOCKING_POLL_RACE, + TEST_LOCKING_POLL_TIMEOUT, + TEST_DONE, + + TEST_GUEST_SAW_IRQ, +}; + +#define XEN_HYPERCALL_MSR 0x40000000 + +#define MIN_STEAL_TIME 50000 + +#define SHINFO_RACE_TIMEOUT 2 /* seconds */ + +#define __HYPERVISOR_set_timer_op 15 +#define __HYPERVISOR_sched_op 29 +#define __HYPERVISOR_event_channel_op 32 + +#define SCHEDOP_poll 3 + +#define EVTCHNOP_send 4 + +#define EVTCHNSTAT_interdomain 2 + +struct evtchn_send { + u32 port; +}; + +struct sched_poll { + u32 *ports; + unsigned int nr_ports; + u64 timeout; +}; + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +struct vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; /* Extra field for overrun check */ +}; + +struct compat_vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; +} __attribute__((__packed__)); + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; + +struct vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; +}; /* 64 bytes (x86) */ + +struct shared_info { + struct vcpu_info vcpu_info[32]; + unsigned long evtchn_pending[64]; + unsigned long evtchn_mask[64]; + struct pvclock_wall_clock wc; + uint32_t wc_sec_hi; + /* arch_shared_info here */ +}; + +#define RUNSTATE_running 0 +#define RUNSTATE_runnable 1 +#define RUNSTATE_blocked 2 +#define RUNSTATE_offline 3 + +static const char *runstate_names[] = { + "running", + "runnable", + "blocked", + "offline" +}; + +struct { + struct kvm_irq_routing info; + struct kvm_irq_routing_entry entries[2]; +} irq_routes; + +static volatile bool guest_saw_irq; + +static void 
evtchn_handler(struct ex_regs *regs) +{ + struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; + + vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); + vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); + guest_saw_irq = true; + + GUEST_SYNC(TEST_GUEST_SAW_IRQ); +} + +static void guest_wait_for_irq(void) +{ + while (!guest_saw_irq) + __asm__ __volatile__ ("rep nop" : : : "memory"); + guest_saw_irq = false; +} + +static void guest_code(void) +{ + struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; + int i; + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + /* Trigger an interrupt injection */ + GUEST_SYNC(TEST_INJECT_VECTOR); + + guest_wait_for_irq(); + + /* Test having the host set runstates manually */ + GUEST_SYNC(TEST_RUNSTATE_runnable); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(TEST_RUNSTATE_blocked); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(TEST_RUNSTATE_offline); + GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0); + GUEST_ASSERT(rs->state == 0); + + /* Test runstate time adjust */ + GUEST_SYNC(TEST_RUNSTATE_ADJUST); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b); + + /* Test runstate time set */ + GUEST_SYNC(TEST_RUNSTATE_DATA); + GUEST_ASSERT(rs->state_entry_time >= 0x8000); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a); + + /* sched_yield() should result in some 'runnable' time */ + GUEST_SYNC(TEST_STEAL_TIME); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); + + /* Attempt to deliver a *masked* interrupt */ + GUEST_SYNC(TEST_EVTCHN_MASKED); + + /* Wait until we see the bit set */ + struct shared_info *si = (void *)SHINFO_VADDR; + while (!si->evtchn_pending[0]) + __asm__ __volatile__ ("rep nop" : : : "memory"); + + /* Now deliver an *unmasked* interrupt */ + GUEST_SYNC(TEST_EVTCHN_UNMASKED); + + guest_wait_for_irq(); + + /* Change memslots and deliver an interrupt */ + GUEST_SYNC(TEST_EVTCHN_SLOWPATH); + + guest_wait_for_irq(); + + /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */ + GUEST_SYNC(TEST_EVTCHN_SEND_IOCTL); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL); + + /* Our turn. Deliver event channel (to ourselves) with + * EVTCHNOP_send hypercall. */ + struct evtchn_send s = { .port = 127 }; + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL_SLOWPATH); + + /* + * Same again, but this time the host has messed with memslots so it + * should take the slow path in kvm_xen_set_evtchn(). + */ + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL_EVENTFD); + + /* Deliver "outbound" event channel to an eventfd which + * happens to be one of our own irqfds. */ + s.port = 197; + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_SETUP); + + /* Set a timer 100ms in the future. */ + xen_hypercall(__HYPERVISOR_set_timer_op, + rs->state_entry_time + 100000000, NULL); + + GUEST_SYNC(TEST_TIMER_WAIT); + + /* Now wait for the timer */ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_RESTORE); + + /* The host has 'restored' the timer. Just wait for it. 
*/ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_POLL_READY); + + /* Poll for an event channel port which is already set */ + u32 ports[1] = { EVTCHN_TIMER }; + struct sched_poll p = { + .ports = ports, + .nr_ports = 1, + .timeout = 0, + }; + + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_TIMEOUT); + + /* Poll for an unset port and wait for the timeout. */ + p.timeout = 100000000; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_MASKED); + + /* A timer will wake the masked port we're waiting on, while we poll */ + p.timeout = 0; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_WAKE); + + /* Set the vcpu_info to point at exactly the place it already is to + * make sure the attribute is functional. */ + GUEST_SYNC(SET_VCPU_INFO); + + /* A timer wake an *unmasked* port which should wake us with an + * actual interrupt, while we're polling on a different port. */ + ports[0]++; + p.timeout = 0; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_PAST); + + /* Timer should have fired already */ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_LOCKING_SEND_RACE); + /* Racing host ioctls */ + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_LOCKING_POLL_RACE); + /* Racing vmcall against host ioctl */ + + ports[0] = 0; + + p = (struct sched_poll) { + .ports = ports, + .nr_ports = 1, + .timeout = 0 + }; + +wait_for_timer: + /* + * Poll for a timer wake event while the worker thread is mucking with + * the shared info. KVM XEN drops timer IRQs if the shared info is + * invalid when the timer expires. Arbitrarily poll 100 times before + * giving up and asking the VMM to re-arm the timer. 100 polls should + * consume enough time to beat on KVM without taking too long if the + * timer IRQ is dropped due to an invalid event channel. + */ + for (i = 0; i < 100 && !guest_saw_irq; i++) + __xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + /* + * Re-send the timer IRQ if it was (likely) dropped due to the timer + * expiring while the event channel was invalid. 
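+	 * The host reacts to TEST_LOCKING_POLL_TIMEOUT by re-arming the timer
+	 * when it has already expired without the IRQ being delivered, and the
+	 * guest then polls again.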
+ */ + if (!guest_saw_irq) { + GUEST_SYNC(TEST_LOCKING_POLL_TIMEOUT); + goto wait_for_timer; + } + guest_saw_irq = false; + + GUEST_SYNC(TEST_DONE); +} + +static struct shared_info *shinfo; +static struct vcpu_info *vinfo; +static struct kvm_vcpu *vcpu; + +static void handle_alrm(int sig) +{ + if (vinfo) + printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); + vcpu_dump(stdout, vcpu, 0); + TEST_FAIL("IRQ delivery timed out"); +} + +static void *juggle_shinfo_state(void *arg) +{ + struct kvm_vm *vm = (struct kvm_vm *)arg; + + struct kvm_xen_hvm_attr cache_activate_gfn = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE + }; + + struct kvm_xen_hvm_attr cache_deactivate_gfn = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = KVM_XEN_INVALID_GFN + }; + + struct kvm_xen_hvm_attr cache_activate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA, + .u.shared_info.hva = (unsigned long)shinfo + }; + + struct kvm_xen_hvm_attr cache_deactivate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.hva = 0 + }; + + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + + for (;;) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn); + pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn); + + if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva); + pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva); + } + } + + return NULL; +} + +int main(int argc, char *argv[]) +{ + struct kvm_xen_hvm_attr evt_reset; + struct kvm_vm *vm; + pthread_t thread; + bool verbose; + int ret; + + verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || + !strncmp(argv[1], "--verbose", 10)); + + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); + + bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); + bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); + bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); + bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Map a region for the shared_info page */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3); + + shinfo = addr_gpa2hva(vm, SHINFO_VADDR); + + int zero_fd = open("/dev/zero", O_RDONLY); + TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero"); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + + /* Let the kernel know that we *will* use it for sending all + * event channels, which lets it intercept SCHEDOP_poll */ + if (do_evtchn_tests) + hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + struct kvm_xen_hvm_attr lm = { + .type = KVM_XEN_ATTR_TYPE_LONG_MODE, + .u.long_mode = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + if (do_runstate_flag) { + struct kvm_xen_hvm_attr ruf = { + .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG, + .u.runstate_update_flag = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf); + + ruf.u.runstate_update_flag = 0; + vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf); + TEST_ASSERT(ruf.u.runstate_update_flag == 1, + "Failed to read 
back RUNSTATE_UPDATE_FLAG attr"); + } + + struct kvm_xen_hvm_attr ha = {}; + + if (has_shinfo_hva) { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA; + ha.u.shared_info.hva = (unsigned long)shinfo; + } else { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO; + ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE; + } + + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); + + /* + * Test what happens when the HVA of the shinfo page is remapped after + * the kernel has a reference to it. But make sure we copy the clock + * info over since that's only set at setup time, and we test it later. + */ + struct pvclock_wall_clock wc_copy = shinfo->wc; + void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0); + TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info"); + shinfo->wc = wc_copy; + + struct kvm_xen_vcpu_attr vi = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, + .u.gpa = VCPU_INFO_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi); + + struct kvm_xen_vcpu_attr pvclock = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, + .u.gpa = PVTIME_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock); + + struct kvm_xen_hvm_attr vec = { + .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, + .u.vector = EVTCHN_VECTOR, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); + + vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); + + if (do_runstate_tests) { + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = RUNSTATE_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); + } + + int irq_fd[2] = { -1, -1 }; + + if (do_eventfd_tests) { + irq_fd[0] = eventfd(0, 0); + irq_fd[1] = eventfd(0, 0); + + /* Unexpected, but not a KVM failure */ + if (irq_fd[0] == -1 || irq_fd[1] == -1) + do_evtchn_tests = do_eventfd_tests = false; + } + + if (do_eventfd_tests) { + irq_routes.info.nr = 2; + + irq_routes.entries[0].gsi = 32; + irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1; + irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id; + irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + irq_routes.entries[1].gsi = 33; + irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2; + irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id; + irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); + + struct kvm_irqfd ifd = { }; + + ifd.fd = irq_fd[0]; + ifd.gsi = 32; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + ifd.fd = irq_fd[1]; + ifd.gsi = 33; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + struct sigaction sa = { }; + sa.sa_handler = handle_alrm; + sigaction(SIGALRM, &sa, NULL); + } + + struct kvm_xen_vcpu_attr tmr = { + .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, + .u.timer.port = EVTCHN_TIMER, + .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + .u.timer.expires_ns = 0 + }; + + if (do_evtchn_tests) { + struct kvm_xen_hvm_attr inj = { + .type = KVM_XEN_ATTR_TYPE_EVTCHN, + .u.evtchn.send_port = 127, + .u.evtchn.type = EVTCHNSTAT_interdomain, + .u.evtchn.flags = 0, + .u.evtchn.deliver.port.port = EVTCHN_TEST1, + .u.evtchn.deliver.port.vcpu = vcpu->id + 1, + .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + /* Test migration to a different vCPU */ + inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE; + inj.u.evtchn.deliver.port.vcpu = vcpu->id; + 
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + inj.u.evtchn.send_port = 197; + inj.u.evtchn.deliver.eventfd.port = 0; + inj.u.evtchn.deliver.eventfd.fd = irq_fd[1]; + inj.u.evtchn.flags = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + } + vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); + vinfo->evtchn_upcall_pending = 0; + + struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); + rs->state = 0x5a; + + bool evtchn_irq_expected = false; + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: { + struct kvm_xen_vcpu_attr rst; + long rundelay; + + if (do_runstate_tests) + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + + switch (uc.args[1]) { + case TEST_INJECT_VECTOR: + if (verbose) + printf("Delivering evtchn upcall\n"); + evtchn_irq_expected = true; + vinfo->evtchn_upcall_pending = 1; + break; + + case TEST_RUNSTATE_runnable...TEST_RUNSTATE_offline: + TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen"); + if (!do_runstate_tests) + goto done; + if (verbose) + printf("Testing runstate %s\n", runstate_names[uc.args[1]]); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; + rst.u.runstate.state = uc.args[1] + RUNSTATE_runnable - + TEST_RUNSTATE_runnable; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_RUNSTATE_ADJUST: + if (verbose) + printf("Testing RUNSTATE_ADJUST\n"); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = (uint64_t)-1; + rst.u.runstate.time_blocked = + 0x5a - rs->time[RUNSTATE_blocked]; + rst.u.runstate.time_offline = + 0x6b6b - rs->time[RUNSTATE_offline]; + rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - + rst.u.runstate.time_offline; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_RUNSTATE_DATA: + if (verbose) + printf("Testing RUNSTATE_DATA\n"); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = RUNSTATE_running; + rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; + rst.u.runstate.time_blocked = 0x6b6b; + rst.u.runstate.time_offline = 0x5a; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_STEAL_TIME: + if (verbose) + printf("Testing steal time\n"); + /* Yield until scheduler delay exceeds target */ + rundelay = get_run_delay() + MIN_STEAL_TIME; + do { + sched_yield(); + } while (get_run_delay() < rundelay); + break; + + case TEST_EVTCHN_MASKED: + if (!do_eventfd_tests) + goto done; + if (verbose) + printf("Testing masked event channel\n"); + shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1; + eventfd_write(irq_fd[0], 1UL); + alarm(1); + break; + + case TEST_EVTCHN_UNMASKED: + if (verbose) + printf("Testing unmasked event channel\n"); + /* Unmask that, but deliver the other one */ + shinfo->evtchn_pending[0] = 0; + shinfo->evtchn_mask[0] = 0; + eventfd_write(irq_fd[1], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_SLOWPATH: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + if (verbose) + printf("Testing event channel after memslot change\n"); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0); + 
eventfd_write(irq_fd[0], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_SEND_IOCTL: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + if (!do_evtchn_tests) + goto done; + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n"); + + struct kvm_irq_routing_xen_evtchn e; + e.port = EVTCHN_TEST2; + e.vcpu = vcpu->id; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send direct to evtchn\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL_SLOWPATH: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send direct to evtchn after memslot change\n"); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + DUMMY_REGION_GPA_2, DUMMY_REGION_SLOT_2, 1, 0); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL_EVENTFD: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send to eventfd\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_TIMER_SETUP: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest oneshot timer\n"); + break; + + case TEST_TIMER_WAIT: + memset(&tmr, 0, sizeof(tmr)); + tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, + "Timer port not returned"); + TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + "Timer priority not returned"); + TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time, + "Timer expiry not returned"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_TIMER_RESTORE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing restored oneshot timer\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_POLL_READY: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing SCHEDOP_poll with already pending event\n"); + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER; + alarm(1); + break; + + case TEST_POLL_TIMEOUT: + if (verbose) + printf("Testing SCHEDOP_poll timeout\n"); + shinfo->evtchn_pending[0] = 0; + alarm(1); + break; + + case TEST_POLL_MASKED: + if (verbose) + printf("Testing SCHEDOP_poll wake on masked event\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case TEST_POLL_WAKE: + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0; + if (verbose) + printf("Testing SCHEDOP_poll wake on unmasked event\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + 
100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + + /* Read it back and check the pending time is reported correctly */ + tmr.u.timer.expires_ns = 0; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, + "Timer not reported pending"); + alarm(1); + break; + + case SET_VCPU_INFO: + if (has_shinfo_hva) { + struct kvm_xen_vcpu_attr vih = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA, + .u.hva = (unsigned long)vinfo + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih); + } + break; + + case TEST_TIMER_PAST: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + /* Read timer and check it is no longer pending */ + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing timer in the past\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case TEST_LOCKING_SEND_RACE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + alarm(0); + + if (verbose) + printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n"); + + ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm); + TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret)); + + struct kvm_irq_routing_xen_evtchn uxe = { + .port = 1, + .vcpu = vcpu->id, + .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL + }; + + evtchn_irq_expected = true; + for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;) + __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe); + break; + + case TEST_LOCKING_POLL_RACE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing shinfo lock corruption (SCHEDOP_poll)\n"); + + shinfo->evtchn_pending[0] = 1; + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + + case TEST_LOCKING_POLL_TIMEOUT: + /* + * Optional and possibly repeated sync point. + * Injecting the timer IRQ may fail if the + * shinfo is invalid when the timer expires. + * If the timer has expired but the IRQ hasn't + * been delivered, rearm the timer and retry. + */ + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + + /* Resume the guest if the timer is still pending. */ + if (tmr.u.timer.expires_ns) + break; + + /* All done if the IRQ was delivered. 
*/ + if (!evtchn_irq_expected) + break; + + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + case TEST_DONE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + ret = pthread_cancel(thread); + TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret)); + + ret = pthread_join(thread, 0); + TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret)); + goto done; + + case TEST_GUEST_SAW_IRQ: + TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); + evtchn_irq_expected = false; + break; + } + break; + } + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } + + done: + evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN; + evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); + + alarm(0); + + /* + * Just a *really* basic check that things are being put in the + * right place. The actual calculations are much the same for + * Xen as they are for the KVM variants, so no need to check. + */ + struct pvclock_wall_clock *wc; + struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; + + wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); + ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); + ti2 = addr_gpa2hva(vm, PVTIME_ADDR); + + if (verbose) { + printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec); + printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul, + ti->tsc_shift, ti->flags); + printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul, + ti2->tsc_shift, ti2->flags); + } + + TEST_ASSERT(wc->version && !(wc->version & 1), + "Bad wallclock version %x", wc->version); + + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + + if (kcdata.flags & KVM_CLOCK_REALTIME) { + if (verbose) { + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", + kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", + kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); + } + + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + + /* + * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but + * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s + * delta as testing clock accuracy is not the goal here. The test just needs to + * check that the value in shinfo is somewhat sane. + */ + TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); + } else { + pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); + } + + TEST_ASSERT(ti->version && !(ti->version & 1), + "Bad time_info version %x", ti->version); + TEST_ASSERT(ti2->version && !(ti2->version & 1), + "Bad time_info version %x", ti->version); + + if (do_runstate_tests) { + /* + * Fetch runstate and check sanity. Strictly speaking in the + * general case we might not expect the numbers to be identical + * but in this case we know we aren't running the vCPU any more. 
+ */ + struct kvm_xen_vcpu_attr rst = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst); + + if (verbose) { + printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", + rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown", + rs->state, rs->state_entry_time); + for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) { + printf("State %s: %" PRIu64 " ns\n", + runstate_names[i], rs->time[i]); + } + } + + /* + * Exercise runstate info at all points across the page boundary, in + * 32-bit and 64-bit mode. In particular, test the case where it is + * configured in 32-bit mode and then switched to 64-bit mode while + * active, which takes it onto the second page. + */ + unsigned long runstate_addr; + struct compat_vcpu_runstate_info *crs; + for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4; + runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) { + + rs = addr_gpa2hva(vm, runstate_addr); + crs = (void *)rs; + + memset(rs, 0xa5, sizeof(*rs)); + + /* Set to compatibility mode */ + lm.u.long_mode = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + /* Set runstate to new address (kernel will write it) */ + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = runstate_addr, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); + + if (verbose) + printf("Compatibility runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + TEST_ASSERT(crs->state_entry_time == crs->time[0] + + crs->time[1] + crs->time[2] + crs->time[3], + "runstate times don't add up"); + + + /* Now switch to 64-bit mode */ + lm.u.long_mode = 1; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + memset(rs, 0xa5, sizeof(*rs)); + + /* Don't change the address, just trigger a write */ + struct kvm_xen_vcpu_attr adj = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST, + .u.runstate.state = (uint64_t)-1 + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj); + + if (verbose) + printf("64-bit runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + } + } + 
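/*
 * A minimal standalone sketch, assuming the usual Xen ABI layouts, of why the
 * loop above walks the runstate area across the page boundary: the structure
 * is 44 bytes for a 32-bit (compat) guest but 48 bytes for a 64-bit guest, so
 * flipping long_mode while the area is live can push its tail onto the second
 * page. The struct names below are illustrative only, not the test's own.
 */
#include <stdint.h>
#include <stdio.h>

struct xen_runstate_64 {		/* 64-bit guest: natural alignment */
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[4];
};

struct xen_runstate_compat {		/* 32-bit guest: 4-byte alignment */
	uint32_t state;
	uint64_t state_entry_time;
	uint64_t time[4];
} __attribute__((__packed__, __aligned__(4)));

int main(void)
{
	/* Prints "compat: 44 bytes, 64-bit: 48 bytes" on x86-64. */
	printf("compat: %zu bytes, 64-bit: %zu bytes\n",
	       sizeof(struct xen_runstate_compat),
	       sizeof(struct xen_runstate_64));
	return 0;
}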
+ kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c new file mode 100644 index 000000000000..2585087cdf5c --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * xen_vmcall_test + * + * Copyright © 2020 Amazon.com, Inc. or its affiliates. + * + * Userspace hypercall testing + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +#define HCALL_REGION_GPA 0xc0000000ULL +#define HCALL_REGION_SLOT 10 + +#define INPUTVALUE 17 +#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x) +#define RETVALUE 0xcafef00dfbfbffffUL + +#define XEN_HYPERCALL_MSR 0x40000200 +#define HV_GUEST_OS_ID_MSR 0x40000000 +#define HV_HYPERCALL_MSR 0x40000001 + +#define HVCALL_SIGNAL_EVENT 0x005d +#define HV_STATUS_INVALID_ALIGNMENT 4 + +static void guest_code(void) +{ + unsigned long rax = INPUTVALUE; + unsigned long rdi = ARGVALUE(1); + unsigned long rsi = ARGVALUE(2); + unsigned long rdx = ARGVALUE(3); + unsigned long rcx; + register unsigned long r10 __asm__("r10") = ARGVALUE(4); + register unsigned long r8 __asm__("r8") = ARGVALUE(5); + register unsigned long r9 __asm__("r9") = ARGVALUE(6); + + /* First a direct invocation of 'vmcall' */ + __asm__ __volatile__("vmcall" : + "=a"(rax) : + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Fill in the Xen hypercall page */ + __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR), + "a" (HCALL_REGION_GPA & 0xffffffff), + "d" (HCALL_REGION_GPA >> 32)); + + /* Set Hyper-V Guest OS ID */ + __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR), + "a" (0x5a), "d" (0)); + + /* Hyper-V hypercall page */ + u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1; + __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR), + "a" (msrval & 0xffffffff), + "d" (msrval >> 32)); + + /* Invoke a Xen hypercall */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + INPUTVALUE * 32), + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Invoke a Hyper-V hypercall */ + rax = 0; + rcx = HVCALL_SIGNAL_EVENT; /* code */ + rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + PAGE_SIZE), + "a"(rax), "c"(rcx), "d"(rdx), + "r"(r8)); + GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned int xen_caps; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vcpu); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + /* Map a region for the hypercall pages */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); + virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); + + for (;;) { + volatile struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_XEN) { + TEST_ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); + TEST_ASSERT_EQ(run->xen.u.hcall.cpl, 0); + TEST_ASSERT_EQ(run->xen.u.hcall.longmode, 1); + TEST_ASSERT_EQ(run->xen.u.hcall.input, 
INPUTVALUE); + TEST_ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6)); + run->xen.u.hcall.result = RETVALUE; + continue; + } + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xss_msr_test.c b/tools/testing/selftests/kvm/x86/xss_msr_test.c new file mode 100644 index 000000000000..f331a4e9bae3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xss_msr_test.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019, Google LLC. + * + * Tests for the IA32_XSS MSR. + */ +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define MSR_BITS 64 + +int main(int argc, char *argv[]) +{ + bool xss_in_msr_list; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t xss_val; + int i, r; + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVES)); + + xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); + TEST_ASSERT(xss_val == 0, + "MSR_IA32_XSS should be initialized to zero"); + + vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val); + + /* + * At present, KVM only supports a guest IA32_XSS value of 0. Verify + * that trying to set the guest IA32_XSS to an unsupported value fails. + * Also, in the future when a non-zero value succeeds check that + * IA32_XSS is in the list of MSRs to save/restore. + */ + xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); + for (i = 0; i < MSR_BITS; ++i) { + r = _vcpu_set_msr(vcpu, MSR_IA32_XSS, 1ull << i); + + /* + * Setting a list of MSRs returns the entry that "faulted", or + * the last entry +1 if all MSRs were successfully written. + */ + TEST_ASSERT(!r || r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); + TEST_ASSERT(r != 1 || xss_in_msr_list, + "IA32_XSS was able to be set, but was not in save/restore list"); + } + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c deleted file mode 100644 index f4ce5a185a7d..000000000000 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * amx tests - * - * Copyright (C) 2021, Intel, Inc. - * - * Tests for amx #NM exception and save/restore. 
- */ -#include -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#ifndef __x86_64__ -# error This test is 64-bit only -#endif - -#define NUM_TILES 8 -#define TILE_SIZE 1024 -#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) - -/* Tile configuration associated: */ -#define PALETTE_TABLE_INDEX 1 -#define MAX_TILES 16 -#define RESERVED_BYTES 14 - -#define XSAVE_HDR_OFFSET 512 - -struct tile_config { - u8 palette_id; - u8 start_row; - u8 reserved[RESERVED_BYTES]; - u16 colsb[MAX_TILES]; - u8 rows[MAX_TILES]; -}; - -struct tile_data { - u8 data[NUM_TILES * TILE_SIZE]; -}; - -struct xtile_info { - u16 bytes_per_tile; - u16 bytes_per_row; - u16 max_names; - u16 max_rows; - u32 xsave_offset; - u32 xsave_size; -}; - -static struct xtile_info xtile; - -static inline void __ldtilecfg(void *cfg) -{ - asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00" - : : "a"(cfg)); -} - -static inline void __tileloadd(void *tile) -{ - asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10" - : : "a"(tile), "d"(0)); -} - -static inline void __tilerelease(void) -{ - asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); -} - -static inline void __xsavec(struct xstate *xstate, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xsavec (%%rdi)" - : : "D" (xstate), "a" (rfbm_lo), "d" (rfbm_hi) - : "memory"); -} - -static void check_xtile_info(void) -{ - GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE); - - GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0)); - GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE); - - xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET); - GUEST_ASSERT(xtile.xsave_offset == 2816); - xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE); - GUEST_ASSERT(xtile.xsave_size == 8192); - GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size); - - GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_MAX_PALETTE_TABLES)); - GUEST_ASSERT(this_cpu_property(X86_PROPERTY_AMX_MAX_PALETTE_TABLES) >= - PALETTE_TABLE_INDEX); - - GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS)); - xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS); - GUEST_ASSERT(xtile.max_names == 8); - xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE); - GUEST_ASSERT(xtile.bytes_per_tile == 1024); - xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW); - GUEST_ASSERT(xtile.bytes_per_row == 64); - xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS); - GUEST_ASSERT(xtile.max_rows == 16); -} - -static void set_tilecfg(struct tile_config *cfg) -{ - int i; - - /* Only palette id 1 */ - cfg->palette_id = 1; - for (i = 0; i < xtile.max_names; i++) { - cfg->colsb[i] = xtile.bytes_per_row; - cfg->rows[i] = xtile.max_rows; - } -} - -static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, - struct tile_data *tiledata, - struct xstate *xstate) -{ - GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && - this_cpu_has(X86_FEATURE_OSXSAVE)); - check_xtile_info(); - GUEST_SYNC(1); - - /* xfd=0, enable amx */ - wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(2); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); - set_tilecfg(amx_cfg); - __ldtilecfg(amx_cfg); - GUEST_SYNC(3); - /* Check save/restore when trap to userspace */ - __tileloadd(tiledata); - GUEST_SYNC(4); - __tilerelease(); - GUEST_SYNC(5); - /* - * After XSAVEC, XTILEDATA is cleared in the 
xstate_bv but is set in - * the xcomp_bv. - */ - xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; - __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); - GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); - GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA); - - /* xfd=0x40000, disable amx tiledata */ - wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); - - /* - * XTILEDATA is cleared in xstate_bv but set in xcomp_bv, this property - * remains the same even when amx tiledata is disabled by IA32_XFD. - */ - xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; - __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); - GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); - GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA)); - - GUEST_SYNC(6); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); - set_tilecfg(amx_cfg); - __ldtilecfg(amx_cfg); - /* Trigger #NM exception */ - __tileloadd(tiledata); - GUEST_SYNC(10); - - GUEST_DONE(); -} - -void guest_nm_handler(struct ex_regs *regs) -{ - /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ - GUEST_SYNC(7); - GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); - GUEST_SYNC(8); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); - GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); - /* Clear xfd_err */ - wrmsr(MSR_IA32_XFD_ERR, 0); - /* xfd=0, enable amx */ - wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(9); -} - -int main(int argc, char *argv[]) -{ - struct kvm_regs regs1, regs2; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_x86_state *state; - int xsave_restore_size; - vm_vaddr_t amx_cfg, tiledata, xstate; - struct ucall uc; - u32 amx_offset; - int ret; - - /* - * Note, all off-by-default features must be enabled before anything - * caches KVM_GET_SUPPORTED_CPUID, e.g. before using kvm_cpu_has(). 
- */ - vm_xsave_require_permission(XFEATURE_MASK_XTILE_DATA); - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA_XFD)); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE), - "KVM should enumerate max XSAVE size when XSAVE is supported"); - xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE); - - vcpu_regs_get(vcpu, ®s1); - - /* Register #NM handler */ - vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); - - /* amx cfg for guest_code */ - amx_cfg = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); - - /* amx tiledata for guest_code */ - tiledata = vm_vaddr_alloc_pages(vm, 2); - memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize()); - - /* XSAVE state for guest_code */ - xstate = vm_vaddr_alloc_pages(vm, DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); - memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); - vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); - - for (;;) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - switch (uc.args[1]) { - case 1: - case 2: - case 3: - case 5: - case 6: - case 7: - case 8: - fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); - break; - case 4: - case 10: - fprintf(stderr, - "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); - - /* Compacted mode, get amx offset by xsave area - * size subtract 8K amx size. - */ - amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; - state = vcpu_save_state(vcpu); - void *amx_start = (void *)state->xsave + amx_offset; - void *tiles_data = (void *)addr_gva2hva(vm, tiledata); - /* Only check TMM0 register, 1 tile */ - ret = memcmp(amx_start, tiles_data, TILE_SIZE); - TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); - kvm_x86_state_cleanup(state); - break; - case 9: - fprintf(stderr, - "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); - break; - } - break; - case UCALL_DONE: - fprintf(stderr, "UCALL_DONE\n"); - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); - } -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c deleted file mode 100644 index f8916bb34405..000000000000 --- a/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2024 Intel Corporation - * - * Verify KVM correctly emulates the APIC bus frequency when the VMM configures - * the frequency via KVM_CAP_X86_APIC_BUS_CYCLES_NS. 
Start the APIC timer by - * programming TMICT (timer initial count) to the largest value possible (so - * that the timer will not expire during the test). Then, after an arbitrary - * amount of time has elapsed, verify TMCCT (timer current count) is within 1% - * of the expected value based on the time elapsed, the APIC bus frequency, and - * the programmed TDCR (timer divide configuration register). - */ - -#include "apic.h" -#include "test_util.h" - -/* - * Possible TDCR values with matching divide count. Used to modify APIC - * timer frequency. - */ -static const struct { - const uint32_t tdcr; - const uint32_t divide_count; -} tdcrs[] = { - {0x0, 2}, - {0x1, 4}, - {0x2, 8}, - {0x3, 16}, - {0x8, 32}, - {0x9, 64}, - {0xa, 128}, - {0xb, 1}, -}; - -static bool is_x2apic; - -static void apic_enable(void) -{ - if (is_x2apic) - x2apic_enable(); - else - xapic_enable(); -} - -static uint32_t apic_read_reg(unsigned int reg) -{ - return is_x2apic ? x2apic_read_reg(reg) : xapic_read_reg(reg); -} - -static void apic_write_reg(unsigned int reg, uint32_t val) -{ - if (is_x2apic) - x2apic_write_reg(reg, val); - else - xapic_write_reg(reg, val); -} - -static void apic_guest_code(uint64_t apic_hz, uint64_t delay_ms) -{ - uint64_t tsc_hz = guest_tsc_khz * 1000; - const uint32_t tmict = ~0u; - uint64_t tsc0, tsc1, freq; - uint32_t tmcct; - int i; - - apic_enable(); - - /* - * Setup one-shot timer. The vector does not matter because the - * interrupt should not fire. - */ - apic_write_reg(APIC_LVTT, APIC_LVT_TIMER_ONESHOT | APIC_LVT_MASKED); - - for (i = 0; i < ARRAY_SIZE(tdcrs); i++) { - apic_write_reg(APIC_TDCR, tdcrs[i].tdcr); - apic_write_reg(APIC_TMICT, tmict); - - tsc0 = rdtsc(); - udelay(delay_ms * 1000); - tmcct = apic_read_reg(APIC_TMCCT); - tsc1 = rdtsc(); - - /* - * Stop the timer _after_ reading the current, final count, as - * writing the initial counter also modifies the current count. - */ - apic_write_reg(APIC_TMICT, 0); - - freq = (tmict - tmcct) * tdcrs[i].divide_count * tsc_hz / (tsc1 - tsc0); - /* Check if measured frequency is within 5% of configured frequency. 
*/ - __GUEST_ASSERT(freq < apic_hz * 105 / 100 && freq > apic_hz * 95 / 100, - "Frequency = %lu (wanted %lu - %lu), bus = %lu, div = %u, tsc = %lu", - freq, apic_hz * 95 / 100, apic_hz * 105 / 100, - apic_hz, tdcrs[i].divide_count, tsc_hz); - } - - GUEST_DONE(); -} - -static void test_apic_bus_clock(struct kvm_vcpu *vcpu) -{ - bool done = false; - struct ucall uc; - - while (!done) { - vcpu_run(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_DONE: - done = true; - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - break; - } - } -} - -static void run_apic_bus_clock_test(uint64_t apic_hz, uint64_t delay_ms, - bool x2apic) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - int ret; - - is_x2apic = x2apic; - - vm = vm_create(1); - - sync_global_to_guest(vm, is_x2apic); - - vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS, - NSEC_PER_SEC / apic_hz); - - vcpu = vm_vcpu_add(vm, 0, apic_guest_code); - vcpu_args_set(vcpu, 2, apic_hz, delay_ms); - - ret = __vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS, - NSEC_PER_SEC / apic_hz); - TEST_ASSERT(ret < 0 && errno == EINVAL, - "Setting of APIC bus frequency after vCPU is created should fail."); - - if (!is_x2apic) - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - test_apic_bus_clock(vcpu); - kvm_vm_free(vm); -} - -static void help(char *name) -{ - puts(""); - printf("usage: %s [-h] [-d delay] [-f APIC bus freq]\n", name); - puts(""); - printf("-d: Delay (in msec) guest uses to measure APIC bus frequency.\n"); - printf("-f: The APIC bus frequency (in MHz) to be configured for the guest.\n"); - puts(""); -} - -int main(int argc, char *argv[]) -{ - /* - * Arbitrarilty default to 25MHz for the APIC bus frequency, which is - * different enough from the default 1GHz to be interesting. - */ - uint64_t apic_hz = 25 * 1000 * 1000; - uint64_t delay_ms = 100; - int opt; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS)); - - while ((opt = getopt(argc, argv, "d:f:h")) != -1) { - switch (opt) { - case 'f': - apic_hz = atoi_positive("APIC bus frequency", optarg) * 1000 * 1000; - break; - case 'd': - delay_ms = atoi_positive("Delay in milliseconds", optarg); - break; - case 'h': - default: - help(argv[0]); - exit(KSFT_SKIP); - } - } - - run_apic_bus_clock_test(apic_hz, delay_ms, false); - run_apic_bus_clock_test(apic_hz, delay_ms, true); -} diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c deleted file mode 100644 index 7b3fda6842bc..000000000000 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ /dev/null @@ -1,225 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Red Hat Inc. 
- * - * Generic tests for KVM CPUID set/get ioctls - */ -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -struct cpuid_mask { - union { - struct { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - }; - u32 regs[4]; - }; -}; - -static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) -{ - int i; - u32 eax, ebx, ecx, edx; - - for (i = 0; i < guest_cpuid->nent; i++) { - __cpuid(guest_cpuid->entries[i].function, - guest_cpuid->entries[i].index, - &eax, &ebx, &ecx, &edx); - - GUEST_ASSERT_EQ(eax, guest_cpuid->entries[i].eax); - GUEST_ASSERT_EQ(ebx, guest_cpuid->entries[i].ebx); - GUEST_ASSERT_EQ(ecx, guest_cpuid->entries[i].ecx); - GUEST_ASSERT_EQ(edx, guest_cpuid->entries[i].edx); - } - -} - -static void guest_main(struct kvm_cpuid2 *guest_cpuid) -{ - GUEST_SYNC(1); - - test_guest_cpuids(guest_cpuid); - - GUEST_SYNC(2); - - GUEST_ASSERT_EQ(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF), 0x40000001); - - GUEST_DONE(); -} - -static struct cpuid_mask get_const_cpuid_mask(const struct kvm_cpuid_entry2 *entry) -{ - struct cpuid_mask mask; - - memset(&mask, 0xff, sizeof(mask)); - - switch (entry->function) { - case 0x1: - mask.regs[X86_FEATURE_OSXSAVE.reg] &= ~BIT(X86_FEATURE_OSXSAVE.bit); - break; - case 0x7: - mask.regs[X86_FEATURE_OSPKE.reg] &= ~BIT(X86_FEATURE_OSPKE.bit); - break; - case 0xd: - /* - * CPUID.0xD.{0,1}.EBX enumerate XSAVE size based on the current - * XCR0 and IA32_XSS MSR values. - */ - if (entry->index < 2) - mask.ebx = 0; - break; - } - return mask; -} - -static void compare_cpuids(const struct kvm_cpuid2 *cpuid1, - const struct kvm_cpuid2 *cpuid2) -{ - const struct kvm_cpuid_entry2 *e1, *e2; - int i; - - TEST_ASSERT(cpuid1->nent == cpuid2->nent, - "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent); - - for (i = 0; i < cpuid1->nent; i++) { - struct cpuid_mask mask; - - e1 = &cpuid1->entries[i]; - e2 = &cpuid2->entries[i]; - - TEST_ASSERT(e1->function == e2->function && - e1->index == e2->index && e1->flags == e2->flags, - "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 0x%x.%d.%x", - i, e1->function, e1->index, e1->flags, - e2->function, e2->index, e2->flags); - - /* Mask off dynamic bits, e.g. OSXSAVE, when comparing entries. 
*/ - mask = get_const_cpuid_mask(e1); - - TEST_ASSERT((e1->eax & mask.eax) == (e2->eax & mask.eax) && - (e1->ebx & mask.ebx) == (e2->ebx & mask.ebx) && - (e1->ecx & mask.ecx) == (e2->ecx & mask.ecx) && - (e1->edx & mask.edx) == (e2->edx & mask.edx), - "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", - e1->function, e1->index, - e1->eax & mask.eax, e1->ebx & mask.ebx, - e1->ecx & mask.ecx, e1->edx & mask.edx, - e2->eax & mask.eax, e2->ebx & mask.ebx, - e2->ecx & mask.ecx, e2->edx & mask.edx); - } -} - -static void run_vcpu(struct kvm_vcpu *vcpu, int stage) -{ - struct ucall uc; - - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1, - "Stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); - return; - case UCALL_DONE: - return; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu->run->exit_reason)); - } -} - -struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) -{ - int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); - vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); - struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); - - memcpy(guest_cpuids, cpuid, size); - - *p_gva = gva; - return guest_cpuids; -} - -static void set_cpuid_after_run(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *ent; - int rc; - u32 eax, ebx, x; - - /* Setting unmodified CPUID is allowed */ - rc = __vcpu_set_cpuid(vcpu); - TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); - - /* Changing CPU features is forbidden */ - ent = vcpu_get_cpuid_entry(vcpu, 0x7); - ebx = ent->ebx; - ent->ebx--; - rc = __vcpu_set_cpuid(vcpu); - TEST_ASSERT(rc, "Changing CPU features should fail"); - ent->ebx = ebx; - - /* Changing MAXPHYADDR is forbidden */ - ent = vcpu_get_cpuid_entry(vcpu, 0x80000008); - eax = ent->eax; - x = eax & 0xff; - ent->eax = (eax & ~0xffu) | (x - 1); - rc = __vcpu_set_cpuid(vcpu); - TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); - ent->eax = eax; -} - -static void test_get_cpuid2(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1); - int i, r; - - vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); - TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, - "KVM didn't update nent on success, wanted %u, got %u", - vcpu->cpuid->nent, cpuid->nent); - - for (i = 0; i < vcpu->cpuid->nent; i++) { - cpuid->nent = i; - r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); - TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r)); - TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure"); - } - free(cpuid); -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - vm_vaddr_t cpuid_gva; - struct kvm_vm *vm; - int stage; - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - - compare_cpuids(kvm_get_supported_cpuid(), vcpu->cpuid); - - vcpu_alloc_cpuid(vm, &cpuid_gva, vcpu->cpuid); - - vcpu_args_set(vcpu, 1, cpuid_gva); - - for (stage = 0; stage < 3; stage++) - run_vcpu(vcpu, stage); - - set_cpuid_after_run(vcpu); - - test_get_cpuid2(vcpu); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c deleted file mode 100644 index 28cc66454601..000000000000 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ /dev/null @@ -1,100 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0 -/* - * CR4 and CPUID sync test - * - * Copyright 2018, Red Hat, Inc. and/or its affiliates. - * - * Author: - * Wei Huang - */ - -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" - -#define MAGIC_HYPERCALL_PORT 0x80 - -static void guest_code(void) -{ - u32 regs[4] = { - [KVM_CPUID_EAX] = X86_FEATURE_OSXSAVE.function, - [KVM_CPUID_ECX] = X86_FEATURE_OSXSAVE.index, - }; - - /* CR4.OSXSAVE should be enabled by default (for selftests vCPUs). */ - GUEST_ASSERT(get_cr4() & X86_CR4_OSXSAVE); - - /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ - GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); - - /* - * Notify hypervisor to clear CR4.0SXSAVE, do CPUID and save output, - * and then restore CR4. Do this all in assembly to ensure no AVX - * instructions are executed while OSXSAVE=0. - */ - asm volatile ( - "out %%al, $" __stringify(MAGIC_HYPERCALL_PORT) "\n\t" - "cpuid\n\t" - "mov %%rdi, %%cr4\n\t" - : "+a" (regs[KVM_CPUID_EAX]), - "=b" (regs[KVM_CPUID_EBX]), - "+c" (regs[KVM_CPUID_ECX]), - "=d" (regs[KVM_CPUID_EDX]) - : "D" (get_cr4()) - ); - - /* Verify KVM cleared OSXSAVE in CPUID when it was cleared in CR4. */ - GUEST_ASSERT(!(regs[X86_FEATURE_OSXSAVE.reg] & BIT(X86_FEATURE_OSXSAVE.bit))); - - /* Verify restoring CR4 also restored OSXSAVE in CPUID. */ - GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_sregs sregs; - struct ucall uc; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - while (1) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - if (vcpu->run->io.port == MAGIC_HYPERCALL_PORT && - vcpu->run->io.direction == KVM_EXIT_IO_OUT) { - /* emulate hypervisor clearing CR4.OSXSAVE */ - vcpu_sregs_get(vcpu, &sregs); - sregs.cr4 &= ~X86_CR4_OSXSAVE; - vcpu_sregs_set(vcpu, &sregs); - continue; - } - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c deleted file mode 100644 index 2d814c1d1dc4..000000000000 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM guest debug register tests - * - * Copyright (C) 2020, Red Hat, Inc. - */ -#include -#include -#include "kvm_util.h" -#include "processor.h" -#include "apic.h" - -#define DR6_BD (1 << 13) -#define DR7_GD (1 << 13) - -#define IRQ_VECTOR 0xAA - -/* For testing data access debug BP */ -uint32_t guest_value; - -extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; - -static void guest_code(void) -{ - /* Create a pending interrupt on current vCPU */ - x2apic_enable(); - x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | - APIC_DM_FIXED | IRQ_VECTOR); - - /* - * Software BP tests. - * - * NOTE: sw_bp need to be before the cmd here, because int3 is an - * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we - * capture it using the vcpu exception bitmap). 
- */ - asm volatile("sw_bp: int3"); - - /* Hardware instruction BP test */ - asm volatile("hw_bp: nop"); - - /* Hardware data BP test */ - asm volatile("mov $1234,%%rax;\n\t" - "mov %%rax,%0;\n\t write_data:" - : "=m" (guest_value) : : "rax"); - - /* - * Single step test, covers 2 basic instructions and 2 emulated - * - * Enable interrupts during the single stepping to see that pending - * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ. - * - * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler - * exits to userspace due to single-step being enabled. - */ - asm volatile("ss_start: " - "sti\n\t" - "xor %%eax,%%eax\n\t" - "cpuid\n\t" - "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t" - "wrmsr\n\t" - "cli\n\t" - : : : "eax", "ebx", "ecx", "edx"); - - /* DR6.BD test */ - asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); - GUEST_DONE(); -} - -#define CAST_TO_RIP(v) ((unsigned long long)&(v)) - -static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) -{ - struct kvm_regs regs; - - vcpu_regs_get(vcpu, ®s); - regs.rip += insn_len; - vcpu_regs_set(vcpu, ®s); -} - -int main(void) -{ - struct kvm_guest_debug debug; - unsigned long long target_dr6, target_rip; - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct kvm_vm *vm; - struct ucall uc; - uint64_t cmd; - int i; - /* Instruction lengths starting at ss_start */ - int ss_size[6] = { - 1, /* sti*/ - 2, /* xor */ - 2, /* cpuid */ - 5, /* mov */ - 2, /* rdmsr */ - 1, /* cli */ - }; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - - /* Test software BPs - int3 */ - memset(&debug, 0, sizeof(debug)); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; - vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vcpu); - TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && - run->debug.arch.exception == BP_VECTOR && - run->debug.arch.pc == CAST_TO_RIP(sw_bp), - "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", - run->exit_reason, run->debug.arch.exception, - run->debug.arch.pc, CAST_TO_RIP(sw_bp)); - vcpu_skip_insn(vcpu, 1); - - /* Test instruction HW BP over DR[0-3] */ - for (i = 0; i < 4; i++) { - memset(&debug, 0, sizeof(debug)); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; - debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); - debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); - vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vcpu); - target_dr6 = 0xffff0ff0 | (1UL << i); - TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && - run->debug.arch.exception == DB_VECTOR && - run->debug.arch.pc == CAST_TO_RIP(hw_bp) && - run->debug.arch.dr6 == target_dr6, - "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx " - "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", - i, run->exit_reason, run->debug.arch.exception, - run->debug.arch.pc, CAST_TO_RIP(hw_bp), - run->debug.arch.dr6, target_dr6); - } - /* Skip "nop" */ - vcpu_skip_insn(vcpu, 1); - - /* Test data access HW BP over DR[0-3] */ - for (i = 0; i < 4; i++) { - memset(&debug, 0, sizeof(debug)); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; - debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); - debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | - (0x000d0000UL << (4*i)); - vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vcpu); - target_dr6 = 0xffff0ff0 | (1UL << i); - TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && - run->debug.arch.exception == DB_VECTOR && - run->debug.arch.pc == CAST_TO_RIP(write_data) && - run->debug.arch.dr6 
== target_dr6, - "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx " - "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", - i, run->exit_reason, run->debug.arch.exception, - run->debug.arch.pc, CAST_TO_RIP(write_data), - run->debug.arch.dr6, target_dr6); - /* Rollback the 4-bytes "mov" */ - vcpu_skip_insn(vcpu, -7); - } - /* Skip the 4-bytes "mov" */ - vcpu_skip_insn(vcpu, 7); - - /* Test single step */ - target_rip = CAST_TO_RIP(ss_start); - target_dr6 = 0xffff4ff0ULL; - for (i = 0; i < ARRAY_SIZE(ss_size); i++) { - target_rip += ss_size[i]; - memset(&debug, 0, sizeof(debug)); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | - KVM_GUESTDBG_BLOCKIRQ; - debug.arch.debugreg[7] = 0x00000400; - vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vcpu); - TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && - run->debug.arch.exception == DB_VECTOR && - run->debug.arch.pc == target_rip && - run->debug.arch.dr6 == target_dr6, - "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx " - "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", - i, run->exit_reason, run->debug.arch.exception, - run->debug.arch.pc, target_rip, run->debug.arch.dr6, - target_dr6); - } - - /* Finally test global disable */ - memset(&debug, 0, sizeof(debug)); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; - debug.arch.debugreg[7] = 0x400 | DR7_GD; - vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vcpu); - target_dr6 = 0xffff0ff0 | DR6_BD; - TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && - run->debug.arch.exception == DB_VECTOR && - run->debug.arch.pc == CAST_TO_RIP(bd_start) && - run->debug.arch.dr6 == target_dr6, - "DR7.GD: exit %d exception %d rip 0x%llx " - "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", - run->exit_reason, run->debug.arch.exception, - run->debug.arch.pc, target_rip, run->debug.arch.dr6, - target_dr6); - - /* Disable all debug controls, run to the end */ - memset(&debug, 0, sizeof(debug)); - vcpu_guest_debug_set(vcpu, &debug); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - cmd = get_ucall(vcpu, &uc); - TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); - - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c deleted file mode 100644 index 2929c067c207..000000000000 --- a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c +++ /dev/null @@ -1,263 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM dirty logging page splitting test - * - * Based on dirty_log_perf.c - * - * Copyright (C) 2018, Red Hat, Inc. - * Copyright (C) 2023, Google, Inc. 
- */ - -#include -#include -#include -#include - -#include "kvm_util.h" -#include "test_util.h" -#include "memstress.h" -#include "guest_modes.h" -#include "ucall_common.h" - -#define VCPUS 2 -#define SLOTS 2 -#define ITERATIONS 2 - -static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; - -static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB; - -static u64 dirty_log_manual_caps; -static bool host_quit; -static int iteration; -static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; - -struct kvm_page_stats { - uint64_t pages_4k; - uint64_t pages_2m; - uint64_t pages_1g; - uint64_t hugepages; -}; - -static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage) -{ - stats->pages_4k = vm_get_stat(vm, "pages_4k"); - stats->pages_2m = vm_get_stat(vm, "pages_2m"); - stats->pages_1g = vm_get_stat(vm, "pages_1g"); - stats->hugepages = stats->pages_2m + stats->pages_1g; - - pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n", - stage, stats->pages_4k, stats->pages_2m, stats->pages_1g, - stats->hugepages); -} - -static void run_vcpu_iteration(struct kvm_vm *vm) -{ - int i; - - iteration++; - for (i = 0; i < VCPUS; i++) { - while (READ_ONCE(vcpu_last_completed_iteration[i]) != - iteration) - ; - } -} - -static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) -{ - struct kvm_vcpu *vcpu = vcpu_args->vcpu; - int vcpu_idx = vcpu_args->vcpu_idx; - - while (!READ_ONCE(host_quit)) { - int current_iteration = READ_ONCE(iteration); - - vcpu_run(vcpu); - - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); - - vcpu_last_completed_iteration[vcpu_idx] = current_iteration; - - /* Wait for the start of the next iteration to be signaled. */ - while (current_iteration == READ_ONCE(iteration) && - READ_ONCE(iteration) >= 0 && - !READ_ONCE(host_quit)) - ; - } -} - -static void run_test(enum vm_guest_mode mode, void *unused) -{ - struct kvm_vm *vm; - unsigned long **bitmaps; - uint64_t guest_num_pages; - uint64_t host_num_pages; - uint64_t pages_per_slot; - int i; - struct kvm_page_stats stats_populated; - struct kvm_page_stats stats_dirty_logging_enabled; - struct kvm_page_stats stats_dirty_pass[ITERATIONS]; - struct kvm_page_stats stats_clear_pass[ITERATIONS]; - struct kvm_page_stats stats_dirty_logging_disabled; - struct kvm_page_stats stats_repopulated; - - vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size, - SLOTS, backing_src, false); - - guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift; - guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); - host_num_pages = vm_num_host_pages(mode, guest_num_pages); - pages_per_slot = host_num_pages / SLOTS; - TEST_ASSERT_EQ(host_num_pages, pages_per_slot * SLOTS); - TEST_ASSERT(!(host_num_pages % 512), - "Number of pages, '%lu' not a multiple of 2MiB", host_num_pages); - - bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot); - - if (dirty_log_manual_caps) - vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, - dirty_log_manual_caps); - - /* Start the iterations */ - iteration = -1; - host_quit = false; - - for (i = 0; i < VCPUS; i++) - vcpu_last_completed_iteration[i] = -1; - - memstress_start_vcpu_threads(VCPUS, vcpu_worker); - - run_vcpu_iteration(vm); - get_page_stats(vm, &stats_populated, "populating memory"); - - /* Enable dirty logging */ - memstress_enable_dirty_logging(vm, SLOTS); - - get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging"); - - while (iteration < ITERATIONS) { - run_vcpu_iteration(vm); - 
get_page_stats(vm, &stats_dirty_pass[iteration - 1], - "dirtying memory"); - - memstress_get_dirty_log(vm, bitmaps, SLOTS); - - if (dirty_log_manual_caps) { - memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot); - - get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log"); - } - } - - /* Disable dirty logging */ - memstress_disable_dirty_logging(vm, SLOTS); - - get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging"); - - /* Run vCPUs again to fault pages back in. */ - run_vcpu_iteration(vm); - get_page_stats(vm, &stats_repopulated, "repopulating memory"); - - /* - * Tell the vCPU threads to quit. No need to manually check that vCPUs - * have stopped running after disabling dirty logging, the join will - * wait for them to exit. - */ - host_quit = true; - memstress_join_vcpu_threads(VCPUS); - - memstress_free_bitmaps(bitmaps, SLOTS); - memstress_destroy_vm(vm); - - TEST_ASSERT_EQ((stats_populated.pages_2m * 512 + - stats_populated.pages_1g * 512 * 512), host_num_pages); - - /* - * Check that all huge pages were split. Since large pages can only - * exist in the data slot, and the vCPUs should have dirtied all pages - * in the data slot, there should be no huge pages left after splitting. - * Splitting happens at dirty log enable time without - * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass - * with that capability. - */ - if (dirty_log_manual_caps) { - TEST_ASSERT_EQ(stats_clear_pass[0].hugepages, 0); - TEST_ASSERT(stats_clear_pass[0].pages_4k >= host_num_pages, - "Expected at least '%lu' 4KiB pages, found only '%lu'", - host_num_pages, stats_clear_pass[0].pages_4k); - TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages); - } else { - TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0); - TEST_ASSERT(stats_dirty_logging_enabled.pages_4k >= host_num_pages, - "Expected at least '%lu' 4KiB pages, found only '%lu'", - host_num_pages, stats_dirty_logging_enabled.pages_4k); - } - - /* - * Once dirty logging is disabled and the vCPUs have touched all their - * memory again, the hugepage counts should be the same as they were - * right after initial population of memory. - */ - TEST_ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m); - TEST_ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g); -} - -static void help(char *name) -{ - puts(""); - printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n", - name); - puts(""); - printf(" -b: specify the size of the memory region which should be\n" - " dirtied by each vCPU. e.g. 10M or 3G.\n" - " (default: 1G)\n"); - backing_src_help("-s"); - puts(""); -} - -int main(int argc, char *argv[]) -{ - int opt; - - TEST_REQUIRE(get_kvm_param_bool("eager_page_split")); - TEST_REQUIRE(get_kvm_param_bool("tdp_mmu")); - - while ((opt = getopt(argc, argv, "b:hs:")) != -1) { - switch (opt) { - case 'b': - guest_percpu_mem_size = parse_size(optarg); - break; - case 'h': - help(argv[0]); - exit(0); - case 's': - backing_src = parse_backing_src_type(optarg); - break; - default: - help(argv[0]); - exit(1); - } - } - - if (!is_backing_src_hugetlb(backing_src)) { - pr_info("This test will only work reliably with HugeTLB memory. 
" - "It can work with THP, but that is best effort.\n"); - } - - guest_modes_append_default(); - - dirty_log_manual_caps = 0; - for_each_guest_mode(run_test, NULL); - - dirty_log_manual_caps = - kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); - - if (dirty_log_manual_caps) { - dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | - KVM_DIRTY_LOG_INITIALLY_SET); - for_each_guest_mode(run_test, NULL); - } else { - pr_info("Skipping testing with MANUAL_PROTECT as it is not supported"); - } - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c deleted file mode 100644 index 81055476d394..000000000000 --- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2022, Google LLC. - * - * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. - */ -#include "flds_emulation.h" -#include "test_util.h" -#include "ucall_common.h" - -#define MMIO_GPA 0x700000000 -#define MMIO_GVA MMIO_GPA - -static void guest_code(void) -{ - /* Execute flds with an MMIO address to force KVM to emulate it. */ - flds(MMIO_GVA); - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); - virt_map(vm, MMIO_GVA, MMIO_GPA, 1); - - vcpu_run(vcpu); - handle_flds_emulation_failure_exit(vcpu); - vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); - - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c b/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c deleted file mode 100644 index a72f13ae2edb..000000000000 --- a/tools/testing/selftests/kvm/x86_64/feature_msrs_test.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2020, Red Hat, Inc. - */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -static bool is_kvm_controlled_msr(uint32_t msr) -{ - return msr == MSR_IA32_VMX_CR0_FIXED1 || msr == MSR_IA32_VMX_CR4_FIXED1; -} - -/* - * For VMX MSRs with a "true" variant, KVM requires userspace to set the "true" - * MSR, and doesn't allow setting the hidden version. - */ -static bool is_hidden_vmx_msr(uint32_t msr) -{ - switch (msr) { - case MSR_IA32_VMX_PINBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - return true; - default: - return false; - } -} - -static bool is_quirked_msr(uint32_t msr) -{ - return msr != MSR_AMD64_DE_CFG; -} - -static void test_feature_msr(uint32_t msr) -{ - const uint64_t supported_mask = kvm_get_feature_msr(msr); - uint64_t reset_value = is_quirked_msr(msr) ? supported_mask : 0; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - /* - * Don't bother testing KVM-controlled MSRs beyond verifying that the - * MSR can be read from userspace. Any value is effectively legal, as - * KVM is bound by x86 architecture, not by ABI. - */ - if (is_kvm_controlled_msr(msr)) - return; - - /* - * More goofy behavior. KVM reports the host CPU's actual revision ID, - * but initializes the vCPU's revision ID to an arbitrary value. - */ - if (msr == MSR_IA32_UCODE_REV) - reset_value = host_cpu_is_intel ? 
0x100000000ULL : 0x01000065; - - /* - * For quirked MSRs, KVM's ABI is to initialize the vCPU's value to the - * full set of features supported by KVM. For non-quirked MSRs, and - * when the quirk is disabled, KVM must zero-initialize the MSR and let - * userspace do the configuration. - */ - vm = vm_create_with_one_vcpu(&vcpu, NULL); - TEST_ASSERT(vcpu_get_msr(vcpu, msr) == reset_value, - "Wanted 0x%lx for %squirked MSR 0x%x, got 0x%lx", - reset_value, is_quirked_msr(msr) ? "" : "non-", msr, - vcpu_get_msr(vcpu, msr)); - if (!is_hidden_vmx_msr(msr)) - vcpu_set_msr(vcpu, msr, supported_mask); - kvm_vm_free(vm); - - if (is_hidden_vmx_msr(msr)) - return; - - if (!kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2) || - !(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) - return; - - vm = vm_create(1); - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_STUFF_FEATURE_MSRS); - - vcpu = vm_vcpu_add(vm, 0, NULL); - TEST_ASSERT(!vcpu_get_msr(vcpu, msr), - "Quirk disabled, wanted '0' for MSR 0x%x, got 0x%lx", - msr, vcpu_get_msr(vcpu, msr)); - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - const struct kvm_msr_list *feature_list; - int i; - - /* - * Skip the entire test if MSR_FEATURES isn't supported, other tests - * will cover the "regular" list of MSRs, the coverage here is purely - * opportunistic and not interesting on its own. - */ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES)); - - (void)kvm_get_msr_index_list(); - - feature_list = kvm_get_feature_msr_index_list(); - for (i = 0; i < feature_list->nmsrs; i++) - test_feature_msr(feature_list->indices[i]); -} diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c deleted file mode 100644 index 762628f7d4ba..000000000000 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020, Google LLC. - * - * Tests for KVM paravirtual feature disablement - */ -#include -#include -#include -#include - -#include "kvm_test_harness.h" -#include "apic.h" -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -/* VMCALL and VMMCALL are both 3-byte opcodes. 
*/ -#define HYPERCALL_INSN_SIZE 3 - -static bool quirk_disabled; - -static void guest_ud_handler(struct ex_regs *regs) -{ - regs->rax = -EFAULT; - regs->rip += HYPERCALL_INSN_SIZE; -} - -static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; -static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; - -extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t do_sched_yield(uint8_t apic_id) -{ - uint64_t ret; - - asm volatile("hypercall_insn:\n\t" - ".byte 0xcc,0xcc,0xcc\n\t" - : "=a"(ret) - : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) - : "memory"); - - return ret; -} - -static void guest_main(void) -{ - const uint8_t *native_hypercall_insn; - const uint8_t *other_hypercall_insn; - uint64_t ret; - - if (host_cpu_is_intel) { - native_hypercall_insn = vmx_vmcall; - other_hypercall_insn = svm_vmmcall; - } else if (host_cpu_is_amd) { - native_hypercall_insn = svm_vmmcall; - other_hypercall_insn = vmx_vmcall; - } else { - GUEST_ASSERT(0); - /* unreachable */ - return; - } - - memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); - - ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); - - /* - * If the quirk is disabled, verify that guest_ud_handler() "returned" - * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is - * enabled, verify that the hypercall succeeded and that KVM patched in - * the "right" hypercall. - */ - if (quirk_disabled) { - GUEST_ASSERT(ret == (uint64_t)-EFAULT); - GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, - HYPERCALL_INSN_SIZE)); - } else { - GUEST_ASSERT(!ret); - GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, - HYPERCALL_INSN_SIZE)); - } - - GUEST_DONE(); -} - -KVM_ONE_VCPU_TEST_SUITE(fix_hypercall); - -static void enter_guest(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct ucall uc; - - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); - break; - case UCALL_DONE: - return; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)", - uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason)); - } -} - -static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) -{ - struct kvm_vm *vm = vcpu->vm; - - vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); - - if (disable_quirk) - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, - KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - - quirk_disabled = disable_quirk; - sync_global_to_guest(vm, quirk_disabled); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - enter_guest(vcpu); -} - -KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main) -{ - test_fix_hypercall(vcpu, false); -} - -KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main) -{ - test_fix_hypercall(vcpu, true); -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - - return test_harness_run(argc, argv); -} diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h deleted file mode 100644 index 37b1a9f52864..000000000000 --- a/tools/testing/selftests/kvm/x86_64/flds_emulation.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTEST_KVM_FLDS_EMULATION_H -#define SELFTEST_KVM_FLDS_EMULATION_H - -#include "kvm_util.h" - -#define FLDS_MEM_EAX ".byte 0xd9, 0x00" - -/* - 
* flds is an instruction that the KVM instruction emulator is known not to - * support. This can be used in guest code along with a mechanism to force - * KVM to emulate the instruction (e.g. by providing an MMIO address) to - * exercise emulation failures. - */ -static inline void flds(uint64_t address) -{ - __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address)); -} - -static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct kvm_regs regs; - uint8_t *insn_bytes; - uint64_t flags; - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); - - TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, - "Unexpected suberror: %u", - run->emulation_failure.suberror); - - flags = run->emulation_failure.flags; - TEST_ASSERT(run->emulation_failure.ndata >= 3 && - flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, - "run->emulation_failure is missing instruction bytes"); - - TEST_ASSERT(run->emulation_failure.insn_size >= 2, - "Expected a 2-byte opcode for 'flds', got %d bytes", - run->emulation_failure.insn_size); - - insn_bytes = run->emulation_failure.insn_bytes; - TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, - "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x", - insn_bytes[0], insn_bytes[1]); - - vcpu_regs_get(vcpu, ®s); - regs.rip += 2; - vcpu_regs_set(vcpu, ®s); -} - -#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */ diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c deleted file mode 100644 index 10b1b0ba374e..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2023, Google LLC. - */ -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "vmx.h" - -void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit) -{ - const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8); - const uint64_t valid = BIT_ULL(18) | BIT_ULL(24); - const uint64_t legal = ignored | valid; - uint64_t val = BIT_ULL(bit); - uint64_t actual; - int r; - - r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val); - TEST_ASSERT(val & ~legal ? !r : r == 1, - "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s", - val, val & ~legal ? "fail" : "succeed"); - - actual = vcpu_get_msr(vcpu, MSR_K7_HWCR); - TEST_ASSERT(actual == (val & valid), - "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx", - bit, actual, (val & valid)); - - vcpu_set_msr(vcpu, MSR_K7_HWCR, 0); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - unsigned int bit; - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - for (bit = 0; bit < BITS_PER_LONG; bit++) - test_hwcr_bit(vcpu, bit); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c deleted file mode 100644 index e058bc676cd6..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ /dev/null @@ -1,263 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Red Hat, Inc. 
- * - * Tests for Hyper-V clocksources - */ -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "hyperv.h" - -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; -} __packed; - -/* Simplified mul_u64_u64_shr() */ -static inline u64 mul_u64_u64_shr64(u64 a, u64 b) -{ - union { - u64 ll; - struct { - u32 low, high; - } l; - } rm, rn, rh, a0, b0; - u64 c; - - a0.ll = a; - b0.ll = b; - - rm.ll = (u64)a0.l.low * b0.l.high; - rn.ll = (u64)a0.l.high * b0.l.low; - rh.ll = (u64)a0.l.high * b0.l.high; - - rh.l.low = c = rm.l.high + rn.l.high + rh.l.low; - rh.l.high = (c >> 32) + rh.l.high; - - return rh.ll; -} - -static inline void nop_loop(void) -{ - int i; - - for (i = 0; i < 100000000; i++) - asm volatile("nop"); -} - -static inline void check_tsc_msr_rdtsc(void) -{ - u64 tsc_freq, r1, r2, t1, t2; - s64 delta_ns; - - tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); - GUEST_ASSERT(tsc_freq > 0); - - /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */ - r1 = rdtsc(); - t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); - r1 = (r1 + rdtsc()) / 2; - nop_loop(); - r2 = rdtsc(); - t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); - r2 = (r2 + rdtsc()) / 2; - - GUEST_ASSERT(r2 > r1 && t2 > t1); - - /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ - delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); - if (delta_ns < 0) - delta_ns = -delta_ns; - - /* 1% tolerance */ - GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100); -} - -static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page) -{ - return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset; -} - -static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page) -{ - u64 r1, r2, t1, t2; - - /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */ - t1 = get_tscpage_ts(tsc_page); - r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); - - /* 10 ms tolerance */ - GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000); - nop_loop(); - - t2 = get_tscpage_ts(tsc_page); - r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); - GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000); -} - -static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa) -{ - u64 tsc_scale, tsc_offset; - - /* Set Guest OS id to enable Hyper-V emulation */ - GUEST_SYNC(1); - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - GUEST_SYNC(2); - - check_tsc_msr_rdtsc(); - - GUEST_SYNC(3); - - /* Set up TSC page is disabled state, check that it's clean */ - wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa); - GUEST_ASSERT(tsc_page->tsc_sequence == 0); - GUEST_ASSERT(tsc_page->tsc_scale == 0); - GUEST_ASSERT(tsc_page->tsc_offset == 0); - - GUEST_SYNC(4); - - /* Set up TSC page is enabled state */ - wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1); - GUEST_ASSERT(tsc_page->tsc_sequence != 0); - - GUEST_SYNC(5); - - check_tsc_msr_tsc_page(tsc_page); - - GUEST_SYNC(6); - - tsc_offset = tsc_page->tsc_offset; - /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */ - - GUEST_SYNC(7); - /* Sanity check TSC page timestamp, it should be close to 0 */ - GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000); - - GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset); - - nop_loop(); - - /* - * Enable Re-enlightenment and check that TSC page stays constant across - * KVM_SET_CLOCK. 
- */ - wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); - wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); - tsc_offset = tsc_page->tsc_offset; - tsc_scale = tsc_page->tsc_scale; - GUEST_SYNC(8); - GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); - GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); - - GUEST_SYNC(9); - - check_tsc_msr_tsc_page(tsc_page); - - /* - * Disable re-enlightenment and TSC page, check that KVM doesn't update - * it anymore. - */ - wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); - wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); - wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); - memset(tsc_page, 0, sizeof(*tsc_page)); - - GUEST_SYNC(10); - GUEST_ASSERT(tsc_page->tsc_sequence == 0); - GUEST_ASSERT(tsc_page->tsc_offset == 0); - GUEST_ASSERT(tsc_page->tsc_scale == 0); - - GUEST_DONE(); -} - -static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu) -{ - u64 tsc_freq, r1, r2, t1, t2; - s64 delta_ns; - - tsc_freq = vcpu_get_msr(vcpu, HV_X64_MSR_TSC_FREQUENCY); - TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); - - /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ - r1 = rdtsc(); - t1 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); - r1 = (r1 + rdtsc()) / 2; - nop_loop(); - r2 = rdtsc(); - t2 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); - r2 = (r2 + rdtsc()) / 2; - - TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); - - /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ - delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); - if (delta_ns < 0) - delta_ns = -delta_ns; - - /* 1% tolerance */ - TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, - "Elapsed time does not match (MSR=%ld, TSC=%ld)", - (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - vm_vaddr_t tsc_page_gva; - int stage; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME)); - TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - - vcpu_set_hv_cpuid(vcpu); - - tsc_page_gva = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); - TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, - "TSC page has to be page aligned"); - vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); - - host_check_tsc_msr_rdtsc(vcpu); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - /* Keep in sync with guest_main() */ - TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d", - stage); - goto out; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, - "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - - /* Reset kvmclock triggering TSC page update */ - if (stage == 7 || stage == 8 || stage == 10) { - struct kvm_clock_data clock = {0}; - - vm_ioctl(vm, KVM_SET_CLOCK, &clock); - } - } - -out: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c deleted file mode 100644 index 4f5881d4ef66..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ /dev/null @@ -1,172 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Test for x86 
KVM_CAP_HYPERV_CPUID - * - * Copyright (C) 2018, Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -static void guest_code(void) -{ -} - -static bool smt_possible(void) -{ - char buf[16]; - FILE *f; - bool res = true; - - f = fopen("/sys/devices/system/cpu/smt/control", "r"); - if (f) { - if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { - if (!strncmp(buf, "forceoff", 8) || - !strncmp(buf, "notsupported", 12)) - res = false; - } - fclose(f); - } - - return res; -} - -static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, - bool evmcs_expected) -{ - int i; - int nent_expected = 10; - u32 test_val; - - TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, - "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" - " (returned %d)", - nent_expected, hv_cpuid_entries->nent); - - for (i = 0; i < hv_cpuid_entries->nent; i++) { - const struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; - - TEST_ASSERT((entry->function >= 0x40000000) && - (entry->function <= 0x40000082), - "function %x is our of supported range", - entry->function); - - TEST_ASSERT(entry->index == 0, - ".index field should be zero"); - - TEST_ASSERT(entry->flags == 0, - ".flags field should be zero"); - - TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && - !entry->padding[2], "padding should be zero"); - - switch (entry->function) { - case 0x40000000: - test_val = 0x40000082; - - TEST_ASSERT(entry->eax == test_val, - "Wrong max leaf report in 0x40000000.EAX: %x" - " (evmcs=%d)", - entry->eax, evmcs_expected - ); - break; - case 0x40000004: - test_val = entry->eax & (1UL << 18); - - TEST_ASSERT(!!test_val == !smt_possible(), - "NoNonArchitecturalCoreSharing bit" - " doesn't reflect SMT setting"); - break; - case 0x4000000A: - TEST_ASSERT(entry->eax & (1UL << 19), - "Enlightened MSR-Bitmap should always be supported" - " 0x40000000.EAX: %x", entry->eax); - if (evmcs_expected) - TEST_ASSERT((entry->eax & 0xffff) == 0x101, - "Supported Enlightened VMCS version range is supposed to be 1:1" - " 0x40000000.EAX: %x", entry->eax); - - break; - default: - break; - - } - /* - * If needed for debug: - * fprintf(stdout, - * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n", - * entry->function, entry->eax, entry->ebx, entry->ecx, - * entry->edx); - */ - } -} - -void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) -{ - static struct kvm_cpuid2 cpuid = {.nent = 0}; - int ret; - - if (vcpu) - ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); - else - ret = __kvm_ioctl(vm->kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); - - TEST_ASSERT(ret == -1 && errno == E2BIG, - "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" - " it should have: %d %d", !vcpu ? 
"KVM" : "vCPU", ret, errno); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - const struct kvm_cpuid2 *hv_cpuid_entries; - struct kvm_vcpu *vcpu; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - /* Test vCPU ioctl version */ - test_hv_cpuid_e2big(vm, vcpu); - - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, false); - free((void *)hv_cpuid_entries); - - if (!kvm_cpu_has(X86_FEATURE_VMX) || - !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - print_skip("Enlightened VMCS is unsupported"); - goto do_sys; - } - vcpu_enable_evmcs(vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, true); - free((void *)hv_cpuid_entries); - -do_sys: - /* Test system ioctl version */ - if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) { - print_skip("KVM_CAP_SYS_HYPERV_CPUID not supported"); - goto out; - } - - test_hv_cpuid_e2big(vm, NULL); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(); - test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); - -out: - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c deleted file mode 100644 index 74cf19661309..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ /dev/null @@ -1,307 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018, Red Hat, Inc. - * - * Tests for Enlightened VMCS, including nested guest state. - */ -#include -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" - -#include "hyperv.h" -#include "vmx.h" - -static int ud_count; - -static void guest_ud_handler(struct ex_regs *regs) -{ - ud_count++; - regs->rip += 3; /* VMLAUNCH */ -} - -static void guest_nmi_handler(struct ex_regs *regs) -{ -} - -static inline void rdmsr_from_l2(uint32_t msr) -{ - /* Currently, L1 doesn't preserve GPRs during vmexits. */ - __asm__ __volatile__ ("rdmsr" : : "c"(msr) : - "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", "r14", "r15"); -} - -/* Exit to L1 from L2 with RDMSR instruction */ -void l2_guest_code(void) -{ - u64 unused; - - GUEST_SYNC(7); - - GUEST_SYNC(8); - - /* Forced exit to L1 upon restore */ - GUEST_SYNC(9); - - vmcall(); - - /* MSR-Bitmap tests */ - rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ - rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ - rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ - vmcall(); - rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ - - /* L2 TLB flush tests */ - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS); - rdmsr_from_l2(MSR_FS_BASE); - /* - * Note: hypercall status (RAX) is not preserved correctly by L1 after - * synthetic vmexit, use unchecked version. - */ - __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS, - &unused); - - /* Done, exit to L1 and never come back. 
*/ - vmcall(); -} - -void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, - vm_vaddr_t hv_hcall_page_gpa) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa); - - x2apic_enable(); - - GUEST_SYNC(1); - GUEST_SYNC(2); - - enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); - evmcs_enable(); - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_SYNC(3); - GUEST_ASSERT(load_evmcs(hv_pages)); - GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); - - GUEST_SYNC(4); - GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); - - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(5); - GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); - current_evmcs->revision_id = -1u; - GUEST_ASSERT(vmlaunch()); - current_evmcs->revision_id = EVMCS_VERSION; - GUEST_SYNC(6); - - vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | - PIN_BASED_NMI_EXITING); - - /* L2 TLB flush setup */ - current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa; - current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; - current_evmcs->hv_vm_id = 1; - current_evmcs->hv_vp_id = 1; - current_vp_assist->nested_control.features.directhypercall = 1; - *(u32 *)(hv_pages->partition_assist) = 0; - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); - GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR); - GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); - - /* - * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is - * up-to-date (RIP points where it should and not at the beginning - * of l2_guest_code(). GUEST_SYNC(9) checkes that. - */ - GUEST_ASSERT(!vmresume()); - - GUEST_SYNC(10); - - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - current_evmcs->guest_rip += 3; /* vmcall */ - - /* Intercept RDMSR 0xc0000100 */ - vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | - CPU_BASED_USE_MSR_BITMAPS); - __set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); - current_evmcs->guest_rip += 2; /* rdmsr */ - - /* Enable enlightened MSR bitmap */ - current_evmcs->hv_enlightenments_control.msr_bitmap = 1; - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); - current_evmcs->guest_rip += 2; /* rdmsr */ - - /* Intercept RDMSR 0xc0000101 without telling KVM about it */ - __set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); - /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ - current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; - GUEST_ASSERT(!vmresume()); - /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */ - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - current_evmcs->guest_rip += 3; /* vmcall */ - - /* Now tell KVM we've changed MSR-Bitmap */ - current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); - current_evmcs->guest_rip += 2; /* rdmsr */ - - /* - * L2 TLB flush test. First VMCALL should be handled directly by L0, - * no VMCALL exit expected. 
- */ - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); - current_evmcs->guest_rip += 2; /* rdmsr */ - /* Enable synthetic vmexit */ - *(u32 *)(hv_pages->partition_assist) = 1; - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH); - - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(11); - - /* Try enlightened vmptrld with an incorrect GPA */ - evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs); - GUEST_ASSERT(vmlaunch()); - GUEST_ASSERT(ud_count == 1); - GUEST_DONE(); -} - -void inject_nmi(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_events events; - - vcpu_events_get(vcpu, &events); - - events.nmi.pending = 1; - events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; - - vcpu_events_set(vcpu, &events); -} - -static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, - struct kvm_vcpu *vcpu) -{ - struct kvm_regs regs1, regs2; - struct kvm_x86_state *state; - - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_hv_cpuid(vcpu); - vcpu_enable_evmcs(vcpu); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); - return vcpu; -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0; - vm_vaddr_t hcall_page; - - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int stage; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); - TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - hcall_page = vm_vaddr_alloc_pages(vm, 1); - memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); - - vcpu_set_hv_cpuid(vcpu); - vcpu_enable_evmcs(vcpu); - - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); - vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); - vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); - - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); - vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); - - pr_info("Running L1 which uses EVMCS to run L2\n"); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - /* UCALL_SYNC is handled here. */ - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - - vcpu = save_restore_vm(vm, vcpu); - - /* Force immediate L2->L1 exit before resuming */ - if (stage == 8) { - pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); - inject_nmi(vcpu); - } - - /* - * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly - * restored VM (before the first KVM_RUN) to check that - * KVM_STATE_NESTED_EVMCS is not lost. 
- */ - if (stage == 9) { - pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); - vcpu = save_restore_vm(vm, vcpu); - } - } - -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c deleted file mode 100644 index 949e08e98f31..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES (0x8001), - * exit to userspace and receive result in guest. - * - * Negative tests are present in hyperv_features.c - * - * Copyright 2022 Google LLC - * Author: Vipin Sharma - */ -#include "kvm_util.h" -#include "processor.h" -#include "hyperv.h" - -/* Any value is fine */ -#define EXT_CAPABILITIES 0xbull - -static void guest_code(vm_paddr_t in_pg_gpa, vm_paddr_t out_pg_gpa, - vm_vaddr_t out_pg_gva) -{ - uint64_t *output_gva; - - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, in_pg_gpa); - - output_gva = (uint64_t *)out_pg_gva; - - hyperv_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, in_pg_gpa, out_pg_gpa); - - /* TLFS states output will be a uint64_t value */ - GUEST_ASSERT_EQ(*output_gva, EXT_CAPABILITIES); - - GUEST_DONE(); -} - -int main(void) -{ - vm_vaddr_t hcall_out_page; - vm_vaddr_t hcall_in_page; - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct kvm_vm *vm; - uint64_t *outval; - struct ucall uc; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - - /* Verify if extended hypercalls are supported */ - if (!kvm_cpuid_has(kvm_get_supported_hv_cpuid(), - HV_ENABLE_EXTENDED_HYPERCALLS)) { - print_skip("Extended calls not supported by the kernel"); - exit(KSFT_SKIP); - } - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - vcpu_set_hv_cpuid(vcpu); - - /* Hypercall input */ - hcall_in_page = vm_vaddr_alloc_pages(vm, 1); - memset(addr_gva2hva(vm, hcall_in_page), 0x0, vm->page_size); - - /* Hypercall output */ - hcall_out_page = vm_vaddr_alloc_pages(vm, 1); - memset(addr_gva2hva(vm, hcall_out_page), 0x0, vm->page_size); - - vcpu_args_set(vcpu, 3, addr_gva2gpa(vm, hcall_in_page), - addr_gva2gpa(vm, hcall_out_page), hcall_out_page); - - vcpu_run(vcpu); - - TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERV, - "Unexpected exit reason: %u (%s)", - run->exit_reason, exit_reason_str(run->exit_reason)); - - outval = addr_gpa2hva(vm, run->hyperv.u.hcall.params[1]); - *outval = EXT_CAPABILITIES; - run->hyperv.u.hcall.result = HV_STATUS_SUCCESS; - - vcpu_run(vcpu); - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s)", - run->exit_reason, exit_reason_str(run->exit_reason)); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unhandled ucall: %ld", uc.cmd); - } - - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c deleted file mode 100644 index 068e9c69710d..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ /dev/null @@ -1,695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Red Hat, Inc. 
- * - * Tests for Hyper-V features enablement - */ -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "hyperv.h" - -/* - * HYPERV_CPUID_ENLIGHTMENT_INFO.EBX is not a 'feature' CPUID leaf - * but to activate the feature it is sufficient to set it to a non-zero - * value. Use BIT(0) for that. - */ -#define HV_PV_SPINLOCKS_TEST \ - KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EBX, 0) - -struct msr_data { - uint32_t idx; - bool fault_expected; - bool write; - u64 write_val; -}; - -struct hcall_data { - uint64_t control; - uint64_t expect; - bool ud_expected; -}; - -static bool is_write_only_msr(uint32_t msr) -{ - return msr == HV_X64_MSR_EOI; -} - -static void guest_msr(struct msr_data *msr) -{ - uint8_t vector = 0; - uint64_t msr_val = 0; - - GUEST_ASSERT(msr->idx); - - if (msr->write) - vector = wrmsr_safe(msr->idx, msr->write_val); - - if (!vector && (!msr->write || !is_write_only_msr(msr->idx))) - vector = rdmsr_safe(msr->idx, &msr_val); - - if (msr->fault_expected) - __GUEST_ASSERT(vector == GP_VECTOR, - "Expected #GP on %sMSR(0x%x), got vector '0x%x'", - msr->write ? "WR" : "RD", msr->idx, vector); - else - __GUEST_ASSERT(!vector, - "Expected success on %sMSR(0x%x), got vector '0x%x'", - msr->write ? "WR" : "RD", msr->idx, vector); - - if (vector || is_write_only_msr(msr->idx)) - goto done; - - if (msr->write) - __GUEST_ASSERT(!vector, - "WRMSR(0x%x) to '0x%lx', RDMSR read '0x%lx'", - msr->idx, msr->write_val, msr_val); - - /* Invariant TSC bit appears when TSC invariant control MSR is written to */ - if (msr->idx == HV_X64_MSR_TSC_INVARIANT_CONTROL) { - if (!this_cpu_has(HV_ACCESS_TSC_INVARIANT)) - GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC)); - else - GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC) == - !!(msr_val & HV_INVARIANT_TSC_EXPOSED)); - } - -done: - GUEST_DONE(); -} - -static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) -{ - u64 res, input, output; - uint8_t vector; - - GUEST_ASSERT_NE(hcall->control, 0); - - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); - - if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { - input = pgs_gpa; - output = pgs_gpa + 4096; - } else { - input = output = 0; - } - - vector = __hyperv_hypercall(hcall->control, input, output, &res); - if (hcall->ud_expected) { - __GUEST_ASSERT(vector == UD_VECTOR, - "Expected #UD for control '%lu', got vector '0x%x'", - hcall->control, vector); - } else { - __GUEST_ASSERT(!vector, - "Expected no exception for control '%lu', got vector '0x%x'", - hcall->control, vector); - GUEST_ASSERT_EQ(res, hcall->expect); - } - - GUEST_DONE(); -} - -static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu) -{ - /* - * Enable all supported Hyper-V features, then clear the leafs holding - * the features that will be tested one by one. 
- */ - vcpu_set_hv_cpuid(vcpu); - - vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); - vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); - vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); -} - -static void guest_test_msrs_access(void) -{ - struct kvm_cpuid2 *prev_cpuid = NULL; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int stage = 0; - vm_vaddr_t msr_gva; - struct msr_data *msr; - bool has_invtsc = kvm_cpu_has(X86_FEATURE_INVTSC); - - while (true) { - vm = vm_create_with_one_vcpu(&vcpu, guest_msr); - - msr_gva = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); - msr = addr_gva2hva(vm, msr_gva); - - vcpu_args_set(vcpu, 1, msr_gva); - vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - - if (!prev_cpuid) { - vcpu_reset_hv_cpuid(vcpu); - - prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); - } else { - vcpu_init_cpuid(vcpu, prev_cpuid); - } - - /* TODO: Make this entire test easier to maintain. */ - if (stage >= 21) - vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); - - switch (stage) { - case 0: - /* - * Only available when Hyper-V identification is set - */ - msr->idx = HV_X64_MSR_GUEST_OS_ID; - msr->write = false; - msr->fault_expected = true; - break; - case 1: - msr->idx = HV_X64_MSR_HYPERCALL; - msr->write = false; - msr->fault_expected = true; - break; - case 2: - vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); - /* - * HV_X64_MSR_GUEST_OS_ID has to be written first to make - * HV_X64_MSR_HYPERCALL available. - */ - msr->idx = HV_X64_MSR_GUEST_OS_ID; - msr->write = true; - msr->write_val = HYPERV_LINUX_OS_ID; - msr->fault_expected = false; - break; - case 3: - msr->idx = HV_X64_MSR_GUEST_OS_ID; - msr->write = false; - msr->fault_expected = false; - break; - case 4: - msr->idx = HV_X64_MSR_HYPERCALL; - msr->write = false; - msr->fault_expected = false; - break; - - case 5: - msr->idx = HV_X64_MSR_VP_RUNTIME; - msr->write = false; - msr->fault_expected = true; - break; - case 6: - vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_RUNTIME_AVAILABLE); - msr->idx = HV_X64_MSR_VP_RUNTIME; - msr->write = false; - msr->fault_expected = false; - break; - case 7: - /* Read only */ - msr->idx = HV_X64_MSR_VP_RUNTIME; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = true; - break; - - case 8: - msr->idx = HV_X64_MSR_TIME_REF_COUNT; - msr->write = false; - msr->fault_expected = true; - break; - case 9: - vcpu_set_cpuid_feature(vcpu, HV_MSR_TIME_REF_COUNT_AVAILABLE); - msr->idx = HV_X64_MSR_TIME_REF_COUNT; - msr->write = false; - msr->fault_expected = false; - break; - case 10: - /* Read only */ - msr->idx = HV_X64_MSR_TIME_REF_COUNT; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = true; - break; - - case 11: - msr->idx = HV_X64_MSR_VP_INDEX; - msr->write = false; - msr->fault_expected = true; - break; - case 12: - vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_INDEX_AVAILABLE); - msr->idx = HV_X64_MSR_VP_INDEX; - msr->write = false; - msr->fault_expected = false; - break; - case 13: - /* Read only */ - msr->idx = HV_X64_MSR_VP_INDEX; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = true; - break; - - case 14: - msr->idx = HV_X64_MSR_RESET; - msr->write = false; - msr->fault_expected = true; - break; - case 15: - vcpu_set_cpuid_feature(vcpu, HV_MSR_RESET_AVAILABLE); - msr->idx = HV_X64_MSR_RESET; - msr->write = false; - msr->fault_expected = false; - break; - case 16: - msr->idx = HV_X64_MSR_RESET; - msr->write = true; - /* - * TODO: the test only 
writes '0' to HV_X64_MSR_RESET - * at the moment, writing some other value there will - * trigger real vCPU reset and the code is not prepared - * to handle it yet. - */ - msr->write_val = 0; - msr->fault_expected = false; - break; - - case 17: - msr->idx = HV_X64_MSR_REFERENCE_TSC; - msr->write = false; - msr->fault_expected = true; - break; - case 18: - vcpu_set_cpuid_feature(vcpu, HV_MSR_REFERENCE_TSC_AVAILABLE); - msr->idx = HV_X64_MSR_REFERENCE_TSC; - msr->write = false; - msr->fault_expected = false; - break; - case 19: - msr->idx = HV_X64_MSR_REFERENCE_TSC; - msr->write = true; - msr->write_val = 0; - msr->fault_expected = false; - break; - - case 20: - msr->idx = HV_X64_MSR_EOM; - msr->write = false; - msr->fault_expected = true; - break; - case 21: - /* - * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 - * capability enabled and guest visible CPUID bit unset. - */ - msr->idx = HV_X64_MSR_EOM; - msr->write = false; - msr->fault_expected = true; - break; - case 22: - vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNIC_AVAILABLE); - msr->idx = HV_X64_MSR_EOM; - msr->write = false; - msr->fault_expected = false; - break; - case 23: - msr->idx = HV_X64_MSR_EOM; - msr->write = true; - msr->write_val = 0; - msr->fault_expected = false; - break; - - case 24: - msr->idx = HV_X64_MSR_STIMER0_CONFIG; - msr->write = false; - msr->fault_expected = true; - break; - case 25: - vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNTIMER_AVAILABLE); - msr->idx = HV_X64_MSR_STIMER0_CONFIG; - msr->write = false; - msr->fault_expected = false; - break; - case 26: - msr->idx = HV_X64_MSR_STIMER0_CONFIG; - msr->write = true; - msr->write_val = 0; - msr->fault_expected = false; - break; - case 27: - /* Direct mode test */ - msr->idx = HV_X64_MSR_STIMER0_CONFIG; - msr->write = true; - msr->write_val = 1 << 12; - msr->fault_expected = true; - break; - case 28: - vcpu_set_cpuid_feature(vcpu, HV_STIMER_DIRECT_MODE_AVAILABLE); - msr->idx = HV_X64_MSR_STIMER0_CONFIG; - msr->write = true; - msr->write_val = 1 << 12; - msr->fault_expected = false; - break; - - case 29: - msr->idx = HV_X64_MSR_EOI; - msr->write = false; - msr->fault_expected = true; - break; - case 30: - vcpu_set_cpuid_feature(vcpu, HV_MSR_APIC_ACCESS_AVAILABLE); - msr->idx = HV_X64_MSR_EOI; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = false; - break; - - case 31: - msr->idx = HV_X64_MSR_TSC_FREQUENCY; - msr->write = false; - msr->fault_expected = true; - break; - case 32: - vcpu_set_cpuid_feature(vcpu, HV_ACCESS_FREQUENCY_MSRS); - msr->idx = HV_X64_MSR_TSC_FREQUENCY; - msr->write = false; - msr->fault_expected = false; - break; - case 33: - /* Read only */ - msr->idx = HV_X64_MSR_TSC_FREQUENCY; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = true; - break; - - case 34: - msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; - msr->write = false; - msr->fault_expected = true; - break; - case 35: - vcpu_set_cpuid_feature(vcpu, HV_ACCESS_REENLIGHTENMENT); - msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; - msr->write = false; - msr->fault_expected = false; - break; - case 36: - msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = false; - break; - case 37: - /* Can only write '0' */ - msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = true; - break; - - case 38: - msr->idx = HV_X64_MSR_CRASH_P0; - msr->write = false; - msr->fault_expected = true; - break; - case 39: - vcpu_set_cpuid_feature(vcpu, 
HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE); - msr->idx = HV_X64_MSR_CRASH_P0; - msr->write = false; - msr->fault_expected = false; - break; - case 40: - msr->idx = HV_X64_MSR_CRASH_P0; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = false; - break; - - case 41: - msr->idx = HV_X64_MSR_SYNDBG_STATUS; - msr->write = false; - msr->fault_expected = true; - break; - case 42: - vcpu_set_cpuid_feature(vcpu, HV_FEATURE_DEBUG_MSRS_AVAILABLE); - vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING); - msr->idx = HV_X64_MSR_SYNDBG_STATUS; - msr->write = false; - msr->fault_expected = false; - break; - case 43: - msr->idx = HV_X64_MSR_SYNDBG_STATUS; - msr->write = true; - msr->write_val = 0; - msr->fault_expected = false; - break; - - case 44: - /* MSR is not available when CPUID feature bit is unset */ - if (!has_invtsc) - goto next_stage; - msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; - msr->write = false; - msr->fault_expected = true; - break; - case 45: - /* MSR is vailable when CPUID feature bit is set */ - if (!has_invtsc) - goto next_stage; - vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT); - msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; - msr->write = false; - msr->fault_expected = false; - break; - case 46: - /* Writing bits other than 0 is forbidden */ - if (!has_invtsc) - goto next_stage; - msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; - msr->write = true; - msr->write_val = 0xdeadbeef; - msr->fault_expected = true; - break; - case 47: - /* Setting bit 0 enables the feature */ - if (!has_invtsc) - goto next_stage; - msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; - msr->write = true; - msr->write_val = 1; - msr->fault_expected = false; - break; - - default: - kvm_vm_free(vm); - return; - } - - vcpu_set_cpuid(vcpu); - - memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); - - pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, - msr->idx, msr->write ? 
"write" : "read"); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - return; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unhandled ucall: %ld", uc.cmd); - return; - } - -next_stage: - stage++; - kvm_vm_free(vm); - } -} - -static void guest_test_hcalls_access(void) -{ - struct kvm_cpuid2 *prev_cpuid = NULL; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int stage = 0; - vm_vaddr_t hcall_page, hcall_params; - struct hcall_data *hcall; - - while (true) { - vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); - - /* Hypercall input/output */ - hcall_page = vm_vaddr_alloc_pages(vm, 2); - memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - - hcall_params = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); - hcall = addr_gva2hva(vm, hcall_params); - - vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); - vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - - if (!prev_cpuid) { - vcpu_reset_hv_cpuid(vcpu); - - prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); - } else { - vcpu_init_cpuid(vcpu, prev_cpuid); - } - - switch (stage) { - case 0: - vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); - hcall->control = 0xbeef; - hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - - case 1: - hcall->control = HVCALL_POST_MESSAGE; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 2: - vcpu_set_cpuid_feature(vcpu, HV_POST_MESSAGES); - hcall->control = HVCALL_POST_MESSAGE; - hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; - break; - - case 3: - hcall->control = HVCALL_SIGNAL_EVENT; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 4: - vcpu_set_cpuid_feature(vcpu, HV_SIGNAL_EVENTS); - hcall->control = HVCALL_SIGNAL_EVENT; - hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; - break; - - case 5: - hcall->control = HVCALL_RESET_DEBUG_SESSION; - hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - case 6: - vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING); - hcall->control = HVCALL_RESET_DEBUG_SESSION; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 7: - vcpu_set_cpuid_feature(vcpu, HV_DEBUGGING); - hcall->control = HVCALL_RESET_DEBUG_SESSION; - hcall->expect = HV_STATUS_OPERATION_DENIED; - break; - - case 8: - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 9: - vcpu_set_cpuid_feature(vcpu, HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED); - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; - hcall->expect = HV_STATUS_SUCCESS; - break; - case 10: - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 11: - vcpu_set_cpuid_feature(vcpu, HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED); - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; - hcall->expect = HV_STATUS_SUCCESS; - break; - - case 12: - hcall->control = HVCALL_SEND_IPI; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 13: - vcpu_set_cpuid_feature(vcpu, HV_X64_CLUSTER_IPI_RECOMMENDED); - hcall->control = HVCALL_SEND_IPI; - hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; - break; - case 14: - /* Nothing in 'sparse banks' -> success */ - hcall->control = HVCALL_SEND_IPI_EX; - hcall->expect = HV_STATUS_SUCCESS; - break; - - case 15: - hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 16: - 
vcpu_set_cpuid_feature(vcpu, HV_PV_SPINLOCKS_TEST); - hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; - hcall->expect = HV_STATUS_SUCCESS; - break; - case 17: - /* XMM fast hypercall */ - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; - hcall->ud_expected = true; - break; - case 18: - vcpu_set_cpuid_feature(vcpu, HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE); - hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; - hcall->ud_expected = false; - hcall->expect = HV_STATUS_SUCCESS; - break; - case 19: - hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES; - hcall->expect = HV_STATUS_ACCESS_DENIED; - break; - case 20: - vcpu_set_cpuid_feature(vcpu, HV_ENABLE_EXTENDED_HYPERCALLS); - hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES | HV_HYPERCALL_FAST_BIT; - hcall->expect = HV_STATUS_INVALID_PARAMETER; - break; - case 21: - kvm_vm_free(vm); - return; - } - - vcpu_set_cpuid(vcpu); - - memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); - - pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - return; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unhandled ucall: %ld", uc.cmd); - return; - } - - stage++; - kvm_vm_free(vm); - } -} - -int main(void) -{ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENFORCE_CPUID)); - - pr_info("Testing access to Hyper-V specific MSRs\n"); - guest_test_msrs_access(); - - pr_info("Testing access to Hyper-V hypercalls\n"); - guest_test_hcalls_access(); -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c deleted file mode 100644 index 22c0c124582f..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ /dev/null @@ -1,308 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests - * - * Copyright (C) 2022, Red Hat, Inc. 
- * - */ -#include -#include - -#include "kvm_util.h" -#include "hyperv.h" -#include "test_util.h" -#include "vmx.h" - -#define RECEIVER_VCPU_ID_1 2 -#define RECEIVER_VCPU_ID_2 65 - -#define IPI_VECTOR 0xfe - -static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1]; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[2]; -}; - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -}; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -}; - -static inline void hv_init(vm_vaddr_t pgs_gpa) -{ - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); -} - -static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa) -{ - u32 vcpu_id; - - x2apic_enable(); - hv_init(pgs_gpa); - - vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); - - /* Signal sender vCPU we're ready */ - ipis_rcvd[vcpu_id] = (u64)-1; - - for (;;) - asm volatile("sti; hlt; cli"); -} - -static void guest_ipi_handler(struct ex_regs *regs) -{ - u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); - - ipis_rcvd[vcpu_id]++; - wrmsr(HV_X64_MSR_EOI, 1); -} - -static inline void nop_loop(void) -{ - int i; - - for (i = 0; i < 100000000; i++) - asm volatile("nop"); -} - -static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) -{ - struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page; - struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page; - int stage = 1, ipis_expected[2] = {0}; - - hv_init(pgs_gpa); - GUEST_SYNC(stage++); - - /* Wait for receiver vCPUs to come up */ - while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2]) - nop_loop(); - ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0; - - /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ - ipi->vector = IPI_VECTOR; - ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; - hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); - GUEST_SYNC(stage++); - /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ - hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT, - IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); - GUEST_SYNC(stage++); - - /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - memset(hcall_page, 0, 4096); - ipi_ex->vector = IPI_VECTOR; - ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - ipi_ex->vp_set.valid_bank_mask = 1 << 0; - ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); - hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); - GUEST_SYNC(stage++); - /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); - hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | - (1 << HV_HYPERCALL_VARHEAD_OFFSET), - IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - 
GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); - GUEST_SYNC(stage++); - - /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - memset(hcall_page, 0, 4096); - ipi_ex->vector = IPI_VECTOR; - ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - ipi_ex->vp_set.valid_bank_mask = 1 << 1; - ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); - hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); - hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | - (1 << HV_HYPERCALL_VARHEAD_OFFSET), - IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - - /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ - memset(hcall_page, 0, 4096); - ipi_ex->vector = IPI_VECTOR; - ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; - ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); - ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); - hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */ - hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); - hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | - (2 << HV_HYPERCALL_VARHEAD_OFFSET), - IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - - /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ - memset(hcall_page, 0, 4096); - ipi_ex->vector = IPI_VECTOR; - ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - /* - * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL. 
- */ - ipi_ex->vp_set.valid_bank_mask = 0; - hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); - hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT, - IPI_VECTOR, HV_GENERIC_SET_ALL); - nop_loop(); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); - GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); - GUEST_SYNC(stage++); - - GUEST_DONE(); -} - -static void *vcpu_thread(void *arg) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; - int old, r; - - r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); - TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", - vcpu->id, r); - - vcpu_run(vcpu); - - TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id); - - return NULL; -} - -static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) -{ - void *retval; - int r; - - r = pthread_cancel(thread); - TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - - r = pthread_join(thread, &retval); - TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - TEST_ASSERT(retval == PTHREAD_CANCELED, - "expected retval=%p, got %p", PTHREAD_CANCELED, - retval); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_vcpu *vcpu[3]; - vm_vaddr_t hcall_page; - pthread_t threads[2]; - int stage = 1, r; - struct ucall uc; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_SEND_IPI)); - - vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); - - /* Hypercall input/output */ - hcall_page = vm_vaddr_alloc_pages(vm, 2); - memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - - - vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); - vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); - vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); - vcpu_set_hv_cpuid(vcpu[1]); - - vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); - vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); - vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); - vcpu_set_hv_cpuid(vcpu[2]); - - vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); - - vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); - vcpu_set_hv_cpuid(vcpu[0]); - - r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); - TEST_ASSERT(!r, "pthread_create failed errno=%d", r); - - r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); - TEST_ASSERT(!r, "pthread_create failed errno=%d", errno); - - while (true) { - vcpu_run(vcpu[0]); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); - - switch (get_ucall(vcpu[0], &uc)) { - case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)", - uc.args[1], stage); - break; - case UCALL_DONE: - goto done; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - stage++; - } - -done: - cancel_join_vcpu_thread(threads[0], vcpu[1]); - cancel_join_vcpu_thread(threads[1], vcpu[2]); - kvm_vm_free(vm); - - return r; -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c deleted file mode 100644 index 0ddb63229bcb..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ /dev/null @@ -1,199 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2022, Red Hat, Inc. - * - * Tests for Hyper-V extensions to SVM. 
- */ -#include -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" -#include "hyperv.h" - -#define L2_GUEST_STACK_SIZE 256 - -/* Exit to L1 from L2 with RDMSR instruction */ -static inline void rdmsr_from_l2(uint32_t msr) -{ - /* Currently, L1 doesn't preserve GPRs during vmexits. */ - __asm__ __volatile__ ("rdmsr" : : "c"(msr) : - "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", "r14", "r15"); -} - -void l2_guest_code(void) -{ - u64 unused; - - GUEST_SYNC(3); - /* Exit to L1 */ - vmmcall(); - - /* MSR-Bitmap tests */ - rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ - rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ - rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ - vmmcall(); - rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ - - GUEST_SYNC(5); - - /* L2 TLB flush tests */ - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | - HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS); - rdmsr_from_l2(MSR_FS_BASE); - /* - * Note: hypercall status (RAX) is not preserved correctly by L1 after - * synthetic vmexit, use unchecked version. - */ - __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | - HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS, &unused); - - /* Done, exit to L1 and never come back. */ - vmmcall(); -} - -static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, - struct hyperv_test_pages *hv_pages, - vm_vaddr_t pgs_gpa) -{ - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; - - GUEST_SYNC(1); - - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); - enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); - - GUEST_ASSERT(svm->vmcb_gpa); - /* Prepare for L2 execution. 
*/ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* L2 TLB flush setup */ - hve->partition_assist_page = hv_pages->partition_assist_gpa; - hve->hv_enlightenments_control.nested_flush_hypercall = 1; - hve->hv_vm_id = 1; - hve->hv_vp_id = 1; - current_vp_assist->nested_control.features.directhypercall = 1; - *(u32 *)(hv_pages->partition_assist) = 0; - - GUEST_SYNC(2); - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_SYNC(4); - vmcb->save.rip += 3; - - /* Intercept RDMSR 0xc0000100 */ - vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; - __set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); - vmcb->save.rip += 2; /* rdmsr */ - - /* Enable enlightened MSR bitmap */ - hve->hv_enlightenments_control.msr_bitmap = 1; - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); - vmcb->save.rip += 2; /* rdmsr */ - - /* Intercept RDMSR 0xc0000101 without telling KVM about it */ - __set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); - /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ - vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS; - run_guest(vmcb, svm->vmcb_gpa); - /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - vmcb->save.rip += 3; /* vmcall */ - - /* Now tell KVM we've changed MSR-Bitmap */ - vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS; - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); - vmcb->save.rip += 2; /* rdmsr */ - - - /* - * L2 TLB flush test. First VMCALL should be handled directly by L0, - * no VMCALL exit expected. - */ - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); - vmcb->save.rip += 2; /* rdmsr */ - /* Enable synthetic vmexit */ - *(u32 *)(hv_pages->partition_assist) = 1; - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL); - GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH); - - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_SYNC(6); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t nested_gva = 0, hv_pages_gva = 0; - vm_vaddr_t hcall_page; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int stage; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_hv_cpuid(vcpu); - vcpu_alloc_svm(vm, &nested_gva); - vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); - - hcall_page = vm_vaddr_alloc_pages(vm, 1); - memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); - - vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); - vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - /* UCALL_SYNC is handled here. 
*/ - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - - } - -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c deleted file mode 100644 index 077cd0ec3040..000000000000 --- a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c +++ /dev/null @@ -1,680 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests - * - * Copyright (C) 2022, Red Hat, Inc. - * - */ -#include -#include -#include - -#include "kvm_util.h" -#include "processor.h" -#include "hyperv.h" -#include "test_util.h" -#include "vmx.h" - -#define WORKER_VCPU_ID_1 2 -#define WORKER_VCPU_ID_2 65 - -#define NTRY 100 -#define NTEST_PAGES 2 - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -}; - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - -/* - * Pass the following info to 'workers' and 'sender' - * - Hypercall page's GVA - * - Hypercall page's GPA - * - Test pages GVA - * - GVAs of the test pages' PTEs - */ -struct test_data { - vm_vaddr_t hcall_gva; - vm_paddr_t hcall_gpa; - vm_vaddr_t test_pages; - vm_vaddr_t test_pages_pte[NTEST_PAGES]; -}; - -/* 'Worker' vCPU code checking the contents of the test page */ -static void worker_guest_code(vm_vaddr_t test_data) -{ - struct test_data *data = (struct test_data *)test_data; - u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); - void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES; - u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64)); - u64 expected, val; - - x2apic_enable(); - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - - for (;;) { - cpu_relax(); - - expected = READ_ONCE(*this_cpu); - - /* - * Make sure the value in the test page is read after reading - * the expectation for the first time. Pairs with wmb() in - * prepare_to_test(). - */ - rmb(); - - val = READ_ONCE(*(u64 *)data->test_pages); - - /* - * Make sure the value in the test page is read after before - * reading the expectation for the second time. Pairs with wmb() - * post_test(). - */ - rmb(); - - /* - * '0' indicates the sender is between iterations, wait until - * the sender is ready for this vCPU to start checking again. - */ - if (!expected) - continue; - - /* - * Re-read the per-vCPU byte to ensure the sender didn't move - * onto a new iteration. - */ - if (expected != READ_ONCE(*this_cpu)) - continue; - - GUEST_ASSERT(val == expected); - } -} - -/* - * Write per-CPU info indicating what each 'worker' CPU is supposed to see in - * test page. '0' means don't check. - */ -static void set_expected_val(void *addr, u64 val, int vcpu_id) -{ - void *exp_page = addr + PAGE_SIZE * NTEST_PAGES; - - *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val; -} - -/* - * Update PTEs swapping two test pages. 
- * TODO: use swap()/xchg() when these are provided. - */ -static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2) -{ - uint64_t tmp = *(uint64_t *)pte_gva1; - - *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2; - *(uint64_t *)pte_gva2 = tmp; -} - -/* - * TODO: replace the silly NOP loop with a proper udelay() implementation. - */ -static inline void do_delay(void) -{ - int i; - - for (i = 0; i < 1000000; i++) - asm volatile("nop"); -} - -/* - * Prepare to test: 'disable' workers by setting the expectation to '0', - * clear hypercall input page and then swap two test pages. - */ -static inline void prepare_to_test(struct test_data *data) -{ - /* Clear hypercall input page */ - memset((void *)data->hcall_gva, 0, PAGE_SIZE); - - /* 'Disable' workers */ - set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1); - set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2); - - /* Make sure workers are 'disabled' before we swap PTEs. */ - wmb(); - - /* Make sure workers have enough time to notice */ - do_delay(); - - /* Swap test page mappings */ - swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]); -} - -/* - * Finalize the test: check hypercall resule set the expected val for - * 'worker' CPUs and give them some time to test. - */ -static inline void post_test(struct test_data *data, u64 exp1, u64 exp2) -{ - /* Make sure we change the expectation after swapping PTEs */ - wmb(); - - /* Set the expectation for workers, '0' means don't test */ - set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1); - set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2); - - /* Make sure workers have enough time to test */ - do_delay(); -} - -#define TESTVAL1 0x0101010101010101 -#define TESTVAL2 0x0202020202020202 - -/* Main vCPU doing the test */ -static void sender_guest_code(vm_vaddr_t test_data) -{ - struct test_data *data = (struct test_data *)test_data; - struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva; - struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva; - vm_paddr_t hcall_gpa = data->hcall_gpa; - int i, stage = 1; - - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); - wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa); - - /* "Slow" hypercalls */ - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush->processor_mask = BIT(WORKER_VCPU_ID_1); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, - hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush->processor_mask = BIT(WORKER_VCPU_ID_1); - flush->gva_list[0] = (u64)data->test_pages; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? 
TESTVAL1 : TESTVAL2, 0x0); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS; - flush->processor_mask = 0; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, - hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS; - flush->gva_list[0] = (u64)data->test_pages; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | - (1 << HV_HYPERCALL_VARHEAD_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - /* bank_contents and gva_list occupy the same space, thus [1] */ - flush_ex->gva_list[1] = (u64)data->test_pages; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - (1 << HV_HYPERCALL_VARHEAD_OFFSET) | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | - BIT_ULL(WORKER_VCPU_ID_1 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); - flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | - (2 << HV_HYPERCALL_VARHEAD_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? 
TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | - BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); - flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - /* bank_contents and gva_list occupy the same space, thus [2] */ - flush_ex->gva_list[2] = (u64)data->test_pages; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - (2 << HV_HYPERCALL_VARHEAD_OFFSET) | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; - flush_ex->gva_list[0] = (u64)data->test_pages; - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - hcall_gpa, hcall_gpa + PAGE_SIZE); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - /* "Fast" hypercalls */ - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->processor_mask = BIT(WORKER_VCPU_ID_1); - hyperv_write_xmm_input(&flush->processor_mask, 1); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | - HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->processor_mask = BIT(WORKER_VCPU_ID_1); - flush->gva_list[0] = (u64)data->test_pages; - hyperv_write_xmm_input(&flush->processor_mask, 1); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | - HV_HYPERCALL_FAST_BIT | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - hyperv_write_xmm_input(&flush->processor_mask, 1); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | - HV_HYPERCALL_FAST_BIT, 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? 
TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush->gva_list[0] = (u64)data->test_pages; - hyperv_write_xmm_input(&flush->processor_mask, 1); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | - HV_HYPERCALL_FAST_BIT | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0, - HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | - HV_FLUSH_ALL_PROCESSORS); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | - HV_HYPERCALL_FAST_BIT | - (1 << HV_HYPERCALL_VARHEAD_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - /* bank_contents and gva_list occupy the same space, thus [1] */ - flush_ex->gva_list[1] = (u64)data->test_pages; - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - HV_HYPERCALL_FAST_BIT | - (1 << HV_HYPERCALL_VARHEAD_OFFSET) | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | - BIT_ULL(WORKER_VCPU_ID_1 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); - flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | - HV_HYPERCALL_FAST_BIT | - (2 << HV_HYPERCALL_VARHEAD_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : - TESTVAL2, i % 2 ? 
TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | - BIT_ULL(WORKER_VCPU_ID_2 / 64); - flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); - flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); - /* bank_contents and gva_list occupy the same space, thus [2] */ - flush_ex->gva_list[2] = (u64)data->test_pages; - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - HV_HYPERCALL_FAST_BIT | - (2 << HV_HYPERCALL_VARHEAD_OFFSET) | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | - HV_HYPERCALL_FAST_BIT, - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? TESTVAL1 : TESTVAL2); - } - - GUEST_SYNC(stage++); - - /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ - for (i = 0; i < NTRY; i++) { - prepare_to_test(data); - flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; - flush_ex->gva_list[0] = (u64)data->test_pages; - hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); - hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | - HV_HYPERCALL_FAST_BIT | - (1UL << HV_HYPERCALL_REP_COMP_OFFSET), - 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); - post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, - i % 2 ? 
TESTVAL1 : TESTVAL2); - } - - GUEST_DONE(); -} - -static void *vcpu_thread(void *arg) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; - struct ucall uc; - int old; - int r; - - r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); - TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", - vcpu->id, r); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - default: - TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id); - } - - return NULL; -} - -static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) -{ - void *retval; - int r; - - r = pthread_cancel(thread); - TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - - r = pthread_join(thread, &retval); - TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - TEST_ASSERT(retval == PTHREAD_CANCELED, - "expected retval=%p, got %p", PTHREAD_CANCELED, - retval); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_vcpu *vcpu[3]; - pthread_t threads[2]; - vm_vaddr_t test_data_page, gva; - vm_paddr_t gpa; - uint64_t *pte; - struct test_data *data; - struct ucall uc; - int stage = 1, r, i; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TLBFLUSH)); - - vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); - - /* Test data page */ - test_data_page = vm_vaddr_alloc_page(vm); - data = (struct test_data *)addr_gva2hva(vm, test_data_page); - - /* Hypercall input/output */ - data->hcall_gva = vm_vaddr_alloc_pages(vm, 2); - data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva); - memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE); - - /* - * Test pages: the first one is filled with '0x01's, the second with '0x02's - * and the test will swap their mappings. The third page keeps the indication - * about the current state of mappings. - */ - data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1); - for (i = 0; i < NTEST_PAGES; i++) - memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i), - (u8)(i + 1), PAGE_SIZE); - set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1); - set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2); - - /* - * Get PTE pointers for test pages and map them inside the guest. - * Use separate page for each PTE for simplicity. - */ - gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); - for (i = 0; i < NTEST_PAGES; i++) { - pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); - gpa = addr_hva2gpa(vm, pte); - __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); - data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); - } - - /* - * Sender vCPU which performs the test: swaps test pages, sets expectation - * for 'workers' and issues TLB flush hypercalls. 
- */ - vcpu_args_set(vcpu[0], 1, test_data_page); - vcpu_set_hv_cpuid(vcpu[0]); - - /* Create worker vCPUs which check the contents of the test pages */ - vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code); - vcpu_args_set(vcpu[1], 1, test_data_page); - vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1); - vcpu_set_hv_cpuid(vcpu[1]); - - vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code); - vcpu_args_set(vcpu[2], 1, test_data_page); - vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2); - vcpu_set_hv_cpuid(vcpu[2]); - - r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); - TEST_ASSERT(!r, "pthread_create() failed"); - - r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); - TEST_ASSERT(!r, "pthread_create() failed"); - - while (true) { - vcpu_run(vcpu[0]); - TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); - - switch (get_ucall(vcpu[0], &uc)) { - case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)", - uc.args[1], stage); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - stage++; - } - -done: - cancel_join_vcpu_thread(threads[0], vcpu[1]); - cancel_join_vcpu_thread(threads[1], vcpu[2]); - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c deleted file mode 100644 index 5bc12222d87a..000000000000 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Google LLC. - * - * Tests for adjusting the KVM clock from userspace - */ -#include -#include -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -struct test_case { - uint64_t kvmclock_base; - int64_t realtime_offset; -}; - -static struct test_case test_cases[] = { - { .kvmclock_base = 0 }, - { .kvmclock_base = 180 * NSEC_PER_SEC }, - { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC }, - { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC }, -}; - -#define GUEST_SYNC_CLOCK(__stage, __val) \ - GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0) - -static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti) -{ - int i; - - wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED); - for (i = 0; i < ARRAY_SIZE(test_cases); i++) - GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc())); -} - -#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) - -static inline void assert_flags(struct kvm_clock_data *data) -{ - TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS, - "unexpected clock data flags: %x (want set: %x)", - data->flags, EXPECTED_FLAGS); -} - -static void handle_sync(struct ucall *uc, struct kvm_clock_data *start, - struct kvm_clock_data *end) -{ - uint64_t obs, exp_lo, exp_hi; - - obs = uc->args[2]; - exp_lo = start->clock; - exp_hi = end->clock; - - assert_flags(start); - assert_flags(end); - - TEST_ASSERT(exp_lo <= obs && obs <= exp_hi, - "unexpected kvm-clock value: %"PRIu64" expected range: [%"PRIu64", %"PRIu64"]", - obs, exp_lo, exp_hi); - - pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", %"PRIu64"]\n", - obs, exp_lo, exp_hi); -} - -static void handle_abort(struct ucall *uc) -{ - REPORT_GUEST_ASSERT(*uc); -} - -static void setup_clock(struct kvm_vm *vm, struct test_case 
*test_case) -{ - struct kvm_clock_data data; - - memset(&data, 0, sizeof(data)); - - data.clock = test_case->kvmclock_base; - if (test_case->realtime_offset) { - struct timespec ts; - int r; - - data.flags |= KVM_CLOCK_REALTIME; - do { - r = clock_gettime(CLOCK_REALTIME, &ts); - if (!r) - break; - } while (errno == EINTR); - - TEST_ASSERT(!r, "clock_gettime() failed: %d", r); - - data.realtime = ts.tv_sec * NSEC_PER_SEC; - data.realtime += ts.tv_nsec; - data.realtime += test_case->realtime_offset; - } - - vm_ioctl(vm, KVM_SET_CLOCK, &data); -} - -static void enter_guest(struct kvm_vcpu *vcpu) -{ - struct kvm_clock_data start, end; - struct kvm_vm *vm = vcpu->vm; - struct ucall uc; - int i; - - for (i = 0; i < ARRAY_SIZE(test_cases); i++) { - setup_clock(vm, &test_cases[i]); - - vm_ioctl(vm, KVM_GET_CLOCK, &start); - - vcpu_run(vcpu); - vm_ioctl(vm, KVM_GET_CLOCK, &end); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - handle_sync(&uc, &start, &end); - break; - case UCALL_ABORT: - handle_abort(&uc); - return; - default: - TEST_ASSERT(0, "unhandled ucall: %ld", uc.cmd); - } - } -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - vm_vaddr_t pvti_gva; - vm_paddr_t pvti_gpa; - struct kvm_vm *vm; - int flags; - - flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); - TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); - - TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - - pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000); - pvti_gpa = addr_gva2gpa(vm, pvti_gva); - vcpu_args_set(vcpu, 2, pvti_gpa, pvti_gva); - - enter_guest(vcpu); - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c deleted file mode 100644 index 78878b3a2725..000000000000 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ /dev/null @@ -1,190 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020, Google LLC. - * - * Tests for KVM paravirtual feature disablement - */ -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -struct msr_data { - uint32_t idx; - const char *name; -}; - -#define TEST_MSR(msr) { .idx = msr, .name = #msr } -#define UCALL_PR_MSR 0xdeadbeef -#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) - -/* - * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or - * written, as the KVM_CPUID_FEATURES leaf is cleared. - */ -static struct msr_data msrs_to_test[] = { - TEST_MSR(MSR_KVM_SYSTEM_TIME), - TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), - TEST_MSR(MSR_KVM_WALL_CLOCK), - TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), - TEST_MSR(MSR_KVM_ASYNC_PF_EN), - TEST_MSR(MSR_KVM_STEAL_TIME), - TEST_MSR(MSR_KVM_PV_EOI_EN), - TEST_MSR(MSR_KVM_POLL_CONTROL), - TEST_MSR(MSR_KVM_ASYNC_PF_INT), - TEST_MSR(MSR_KVM_ASYNC_PF_ACK), -}; - -static void test_msr(struct msr_data *msr) -{ - uint64_t ignored; - uint8_t vector; - - PR_MSR(msr); - - vector = rdmsr_safe(msr->idx, &ignored); - GUEST_ASSERT_EQ(vector, GP_VECTOR); - - vector = wrmsr_safe(msr->idx, 0); - GUEST_ASSERT_EQ(vector, GP_VECTOR); -} - -struct hcall_data { - uint64_t nr; - const char *name; -}; - -#define TEST_HCALL(hc) { .nr = hc, .name = #hc } -#define UCALL_PR_HCALL 0xdeadc0de -#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) - -/* - * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding - * features have been cleared in KVM_CPUID_FEATURES. 
- */ -static struct hcall_data hcalls_to_test[] = { - TEST_HCALL(KVM_HC_KICK_CPU), - TEST_HCALL(KVM_HC_SEND_IPI), - TEST_HCALL(KVM_HC_SCHED_YIELD), -}; - -static void test_hcall(struct hcall_data *hc) -{ - uint64_t r; - - PR_HCALL(hc); - r = kvm_hypercall(hc->nr, 0, 0, 0, 0); - GUEST_ASSERT_EQ(r, -KVM_ENOSYS); -} - -static void guest_main(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { - test_msr(&msrs_to_test[i]); - } - - for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { - test_hcall(&hcalls_to_test[i]); - } - - GUEST_DONE(); -} - -static void pr_msr(struct ucall *uc) -{ - struct msr_data *msr = (struct msr_data *)uc->args[0]; - - pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); -} - -static void pr_hcall(struct ucall *uc) -{ - struct hcall_data *hc = (struct hcall_data *)uc->args[0]; - - pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); -} - -static void enter_guest(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - while (true) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_PR_MSR: - pr_msr(&uc); - break; - case UCALL_PR_HCALL: - pr_hcall(&uc); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - return; - case UCALL_DONE: - return; - } - } -} - -static void test_pv_unhalt(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct kvm_cpuid_entry2 *ent; - u32 kvm_sig_old; - - pr_info("testing KVM_FEATURE_PV_UNHALT\n"); - - TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); - - /* KVM_PV_UNHALT test */ - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); - - TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), - "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); - - /* Make sure KVM clears vcpu->arch.kvm_cpuid */ - ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); - kvm_sig_old = ent->ebx; - ent->ebx = 0xdeadbeef; - vcpu_set_cpuid(vcpu); - - vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); - ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); - ent->ebx = kvm_sig_old; - vcpu_set_cpuid(vcpu); - - TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), - "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); - - /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ - - kvm_vm_free(vm); -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - - vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); - - vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); - - enter_guest(vcpu); - kvm_vm_free(vm); - - test_pv_unhalt(); -} diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c deleted file mode 100644 index 7e2bfb3c3f3b..000000000000 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * maximum APIC ID capability tests - * - * Copyright (C) 2022, Intel, Inc. 
- * - * Tests for getting/setting maximum APIC ID capability - */ - -#include "kvm_util.h" - -#define MAX_VCPU_ID 2 - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - int ret; - - vm = vm_create_barebones(); - - /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ - ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); - - /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ - ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, ret + 1); - TEST_ASSERT(ret < 0, - "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail"); - - /* Test BOOT_CPU_ID interaction (MAX_VCPU_ID cannot be lower) */ - if (kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) { - vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)MAX_VCPU_ID); - - /* Try setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID */ - ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID - 1); - TEST_ASSERT(ret < 0, - "Setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID should fail"); - } - - /* Set KVM_CAP_MAX_VCPU_ID */ - vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID); - - /* Try to set KVM_CAP_MAX_VCPU_ID again */ - ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1); - TEST_ASSERT(ret < 0, - "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail"); - - /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */ - ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID); - TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail"); - - /* Create vCPU with bits 63:32 != 0, but an otherwise valid id */ - ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(1L << 32)); - TEST_ASSERT(ret < 0, "Creating vCPU with ID[63:32] != 0 should fail"); - - /* Create vCPU with id within bounds */ - ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)0); - TEST_ASSERT(ret >= 0, "Creating vCPU with ID 0 should succeed"); - - close(ret); - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c deleted file mode 100644 index 2b550eff35f1..000000000000 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -#include "kvm_util.h" -#include "processor.h" - -#define CPUID_MWAIT (1u << 3) - -enum monitor_mwait_testcases { - MWAIT_QUIRK_DISABLED = BIT(0), - MISC_ENABLES_QUIRK_DISABLED = BIT(1), - MWAIT_DISABLED = BIT(2), -}; - -/* - * If both MWAIT and its quirk are disabled, MONITOR/MWAIT should #UD, in all - * other scenarios KVM should emulate them as nops. - */ -#define GUEST_ASSERT_MONITOR_MWAIT(insn, testcase, vector) \ -do { \ - bool fault_wanted = ((testcase) & MWAIT_QUIRK_DISABLED) && \ - ((testcase) & MWAIT_DISABLED); \ - \ - if (fault_wanted) \ - __GUEST_ASSERT((vector) == UD_VECTOR, \ - "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", \ - testcase, vector); \ - else \ - __GUEST_ASSERT(!(vector), \ - "Expected success on " insn " for testcase '0x%x', got '0x%x'", \ - testcase, vector); \ -} while (0) - -static void guest_monitor_wait(int testcase) -{ - u8 vector; - - GUEST_SYNC(testcase); - - /* - * Arbitrarily MONITOR this function, SVM performs fault checks before - * intercept checks, so the inputs for MONITOR and MWAIT must be valid. 
- */ - vector = kvm_asm_safe("monitor", "a"(guest_monitor_wait), "c"(0), "d"(0)); - GUEST_ASSERT_MONITOR_MWAIT("MONITOR", testcase, vector); - - vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0)); - GUEST_ASSERT_MONITOR_MWAIT("MWAIT", testcase, vector); -} - -static void guest_code(void) -{ - guest_monitor_wait(MWAIT_DISABLED); - - guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); - - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED); - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED); - - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); - guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - uint64_t disabled_quirks; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - int testcase; - - TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); - - while (1) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - testcase = uc.args[1]; - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - goto done; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - goto done; - } - - disabled_quirks = 0; - if (testcase & MWAIT_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; - if (testcase & MISC_ENABLES_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); - - /* - * If the MISC_ENABLES quirk (KVM neglects to update CPUID to - * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT - * bit in MISC_ENABLES accordingly. If the quirk is enabled, - * the only valid configuration is MWAIT disabled, as CPUID - * can't be manually changed after running the vCPU. - */ - if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) { - TEST_ASSERT(testcase & MWAIT_DISABLED, - "Can't toggle CPUID features after running vCPU"); - continue; - } - - vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, - (testcase & MWAIT_DISABLED) ? 0 : MSR_IA32_MISC_ENABLE_MWAIT); - } - -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c deleted file mode 100644 index 3eb0313ffa39..000000000000 --- a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c +++ /dev/null @@ -1,288 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" -#include "svm_util.h" - -#define L2_GUEST_STACK_SIZE 256 - -/* - * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with - * the "real" exceptions used, #SS/#GP/#DF (12/13/8). - */ -#define FAKE_TRIPLE_FAULT_VECTOR 0xaa - -/* Arbitrary 32-bit error code injected by this test. */ -#define SS_ERROR_CODE 0xdeadbeef - -/* - * Bit '0' is set on Intel if the exception occurs while delivering a previous - * event/exception. AMD's wording is ambiguous, but presumably the bit is set - * if the exception occurs while delivering an external event, e.g. NMI or INTR, - * but not for exceptions that occur when delivering other exceptions or - * software interrupts. 
- * - * Note, Intel's name for it, "External event", is misleading and much more - * aligned with AMD's behavior, but the SDM is quite clear on its behavior. - */ -#define ERROR_CODE_EXT_FLAG BIT(0) - -/* - * Bit '1' is set if the fault occurred when looking up a descriptor in the - * IDT, which is the case here as the IDT is empty/NULL. - */ -#define ERROR_CODE_IDT_FLAG BIT(1) - -/* - * The #GP that occurs when vectoring #SS should show the index into the IDT - * for #SS, plus have the "IDT flag" set. - */ -#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG) -#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG) - -/* - * Intel and AMD both shove '0' into the error code on #DF, regardless of what - * led to the double fault. - */ -#define DF_ERROR_CODE 0 - -#define INTERCEPT_SS (BIT_ULL(SS_VECTOR)) -#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR)) -#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR)) - -static void l2_ss_pending_test(void) -{ - GUEST_SYNC(SS_VECTOR); -} - -static void l2_ss_injected_gp_test(void) -{ - GUEST_SYNC(GP_VECTOR); -} - -static void l2_ss_injected_df_test(void) -{ - GUEST_SYNC(DF_VECTOR); -} - -static void l2_ss_injected_tf_test(void) -{ - GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR); -} - -static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, - uint32_t error_code) -{ - struct vmcb *vmcb = svm->vmcb; - struct vmcb_control_area *ctrl = &vmcb->control; - - vmcb->save.rip = (u64)l2_code; - run_guest(vmcb, svm->vmcb_gpa); - - if (vector == FAKE_TRIPLE_FAULT_VECTOR) - return; - - GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); - GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); -} - -static void l1_svm_code(struct svm_test_data *svm) -{ - struct vmcb_control_area *ctrl = &svm->vmcb->control; - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - svm->vmcb->save.idtr.limit = 0; - ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); - - ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF; - svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE); - svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD); - - ctrl->intercept_exceptions = INTERCEPT_SS_DF; - svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); - - ctrl->intercept_exceptions = INTERCEPT_SS; - svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); - GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN); - - GUEST_DONE(); -} - -static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) -{ - GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code)); - - GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0); - - if (vector == FAKE_TRIPLE_FAULT_VECTOR) - return; - - GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); - GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); - GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); -} - -static void l1_vmx_code(struct vmx_pages *vmx) -{ - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); - - GUEST_ASSERT_EQ(load_vmcs(vmx), true); - - prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); - - /* - * VMX disallows injecting an exception with error_code[31:16] != 0, - * and hardware will never generate a VM-Exit with bits 31:16 set. - * KVM should likewise truncate the "bad" userspace value. 
- */ - GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0); - vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE); - vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL); - - GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0); - vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); - - GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0); - vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); - GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT); - - GUEST_DONE(); -} - -static void __attribute__((__flatten__)) l1_guest_code(void *test_data) -{ - if (this_cpu_has(X86_FEATURE_SVM)) - l1_svm_code(test_data); - else - l1_vmx_code(test_data); -} - -static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector) -{ - struct ucall uc; - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(vector == uc.args[1], - "Expected L2 to ask for %d, got %ld", vector, uc.args[1]); - break; - case UCALL_DONE: - TEST_ASSERT(vector == -1, - "Expected L2 to ask for %d, L2 says it's done", vector); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - default: - TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd); - } -} - -static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject) -{ - struct kvm_vcpu_events events; - - vcpu_events_get(vcpu, &events); - - TEST_ASSERT(!events.exception.pending, - "Vector %d unexpectedlt pending", events.exception.nr); - TEST_ASSERT(!events.exception.injected, - "Vector %d unexpectedly injected", events.exception.nr); - - events.flags = KVM_VCPUEVENT_VALID_PAYLOAD; - events.exception.pending = !inject; - events.exception.injected = inject; - events.exception.nr = SS_VECTOR; - events.exception.has_error_code = true; - events.exception.error_code = SS_ERROR_CODE; - vcpu_events_set(vcpu, &events); -} - -/* - * Verify KVM_{G,S}ET_EVENTS play nice with pending vs. injected exceptions - * when an exception is being queued for L2. Specifically, verify that KVM - * honors L1 exception intercept controls when a #SS is pending/injected, - * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted - * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1. - */ -int main(int argc, char *argv[]) -{ - vm_vaddr_t nested_test_data_gva; - struct kvm_vcpu_events events; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); - - if (kvm_cpu_has(X86_FEATURE_SVM)) - vcpu_alloc_svm(vm, &nested_test_data_gva); - else - vcpu_alloc_vmx(vm, &nested_test_data_gva); - - vcpu_args_set(vcpu, 1, nested_test_data_gva); - - /* Run L1 => L2. L2 should sync and request #SS. */ - vcpu_run(vcpu); - assert_ucall_vector(vcpu, SS_VECTOR); - - /* Pend #SS and request immediate exit. #SS should still be pending. */ - queue_ss_exception(vcpu, false); - vcpu->run->immediate_exit = true; - vcpu_run_complete_io(vcpu); - - /* Verify the pending events comes back out the same as it went in. 
*/ - vcpu_events_get(vcpu, &events); - TEST_ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD, - KVM_VCPUEVENT_VALID_PAYLOAD); - TEST_ASSERT_EQ(events.exception.pending, true); - TEST_ASSERT_EQ(events.exception.nr, SS_VECTOR); - TEST_ASSERT_EQ(events.exception.has_error_code, true); - TEST_ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE); - - /* - * Run for real with the pending #SS, L1 should get a VM-Exit due to - * #SS interception and re-enter L2 to request #GP (via injected #SS). - */ - vcpu->run->immediate_exit = false; - vcpu_run(vcpu); - assert_ucall_vector(vcpu, GP_VECTOR); - - /* - * Inject #SS, the #SS should bypass interception and cause #GP, which - * L1 should intercept before KVM morphs it to #DF. L1 should then - * disable #GP interception and run L2 to request #DF (via #SS => #GP). - */ - queue_ss_exception(vcpu, true); - vcpu_run(vcpu); - assert_ucall_vector(vcpu, DF_VECTOR); - - /* - * Inject #SS, the #SS should bypass interception and cause #GP, which - * L1 is no longer interception, and so should see a #DF VM-Exit. L1 - * should then signal that is done. - */ - queue_ss_exception(vcpu, true); - vcpu_run(vcpu); - assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR); - - /* - * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so - * should see nested TRIPLE_FAULT / SHUTDOWN. - */ - queue_ss_exception(vcpu, true); - vcpu_run(vcpu); - assert_ucall_vector(vcpu, -1); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c deleted file mode 100644 index e7efb2b35f8b..000000000000 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Usage: to be run via nx_huge_page_test.sh, which does the necessary - * environment setup and teardown - * - * Copyright (C) 2022, Google LLC. - */ -#include -#include -#include - -#include -#include "kvm_util.h" -#include "processor.h" - -#define HPAGE_SLOT 10 -#define HPAGE_GPA (4UL << 30) /* 4G prevents collision w/ slot 0 */ -#define HPAGE_GVA HPAGE_GPA /* GVA is arbitrary, so use GPA. */ -#define PAGES_PER_2MB_HUGE_PAGE 512 -#define HPAGE_SLOT_NPAGES (3 * PAGES_PER_2MB_HUGE_PAGE) - -/* - * Passed by nx_huge_pages_test.sh to provide an easy warning if this test is - * being run without it. - */ -#define MAGIC_TOKEN 887563923 - -/* - * x86 opcode for the return instruction. Used to call into, and then - * immediately return from, memory backed with hugepages. - */ -#define RETURN_OPCODE 0xC3 - -/* Call the specified memory address. */ -static void guest_do_CALL(uint64_t target) -{ - ((void (*)(void)) target)(); -} - -/* - * Exit the VM after each memory access so that the userspace component of the - * test can make assertions about the pages backing the VM. - * - * See the below for an explanation of how each access should affect the - * backing mappings. 
- */ -void guest_code(void) -{ - uint64_t hpage_1 = HPAGE_GVA; - uint64_t hpage_2 = hpage_1 + (PAGE_SIZE * 512); - uint64_t hpage_3 = hpage_2 + (PAGE_SIZE * 512); - - READ_ONCE(*(uint64_t *)hpage_1); - GUEST_SYNC(1); - - READ_ONCE(*(uint64_t *)hpage_2); - GUEST_SYNC(2); - - guest_do_CALL(hpage_1); - GUEST_SYNC(3); - - guest_do_CALL(hpage_3); - GUEST_SYNC(4); - - READ_ONCE(*(uint64_t *)hpage_1); - GUEST_SYNC(5); - - READ_ONCE(*(uint64_t *)hpage_3); - GUEST_SYNC(6); -} - -static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m) -{ - int actual_pages_2m; - - actual_pages_2m = vm_get_stat(vm, "pages_2m"); - - TEST_ASSERT(actual_pages_2m == expected_pages_2m, - "Unexpected 2m page count. Expected %d, got %d", - expected_pages_2m, actual_pages_2m); -} - -static void check_split_count(struct kvm_vm *vm, int expected_splits) -{ - int actual_splits; - - actual_splits = vm_get_stat(vm, "nx_lpage_splits"); - - TEST_ASSERT(actual_splits == expected_splits, - "Unexpected NX huge page split count. Expected %d, got %d", - expected_splits, actual_splits); -} - -static void wait_for_reclaim(int reclaim_period_ms) -{ - long reclaim_wait_ms; - struct timespec ts; - - reclaim_wait_ms = reclaim_period_ms * 5; - ts.tv_sec = reclaim_wait_ms / 1000; - ts.tv_nsec = (reclaim_wait_ms - (ts.tv_sec * 1000)) * 1000000; - nanosleep(&ts, NULL); -} - -void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, - bool reboot_permissions) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t nr_bytes; - void *hva; - int r; - - vm = vm_create(1); - - if (disable_nx_huge_pages) { - r = __vm_disable_nx_huge_pages(vm); - if (reboot_permissions) { - TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); - } else { - TEST_ASSERT(r == -1 && errno == EPERM, - "This process should not have permission to disable NX huge pages"); - return; - } - } - - vcpu = vm_vcpu_add(vm, 0, guest_code); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB, - HPAGE_GPA, HPAGE_SLOT, - HPAGE_SLOT_NPAGES, 0); - - nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; - - /* - * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the - * region into the guest with 2MiB pages whenever TDP is disabled (i.e. - * whenever KVM is shadowing the guest page tables). - * - * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge - * pages irrespective of the guest page size, so map with 4KiB pages - * to test that that is the case. - */ - if (kvm_is_tdp_enabled()) - virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); - else - virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); - - hva = addr_gpa2hva(vm, HPAGE_GPA); - memset(hva, RETURN_OPCODE, nr_bytes); - - check_2m_page_count(vm, 0); - check_split_count(vm, 0); - - /* - * The guest code will first read from the first hugepage, resulting - * in a huge page mapping being created. - */ - vcpu_run(vcpu); - check_2m_page_count(vm, 1); - check_split_count(vm, 0); - - /* - * Then the guest code will read from the second hugepage, resulting - * in another huge page mapping being created. - */ - vcpu_run(vcpu); - check_2m_page_count(vm, 2); - check_split_count(vm, 0); - - /* - * Next, the guest will execute from the first huge page, causing it - * to be remapped at 4k. - * - * If NX huge pages are disabled, this should have no effect. - */ - vcpu_run(vcpu); - check_2m_page_count(vm, disable_nx_huge_pages ? 2 : 1); - check_split_count(vm, disable_nx_huge_pages ? 
0 : 1); - - /* - * Executing from the third huge page (previously unaccessed) will - * cause part to be mapped at 4k. - * - * If NX huge pages are disabled, it should be mapped at 2M. - */ - vcpu_run(vcpu); - check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); - check_split_count(vm, disable_nx_huge_pages ? 0 : 2); - - /* Reading from the first huge page again should have no effect. */ - vcpu_run(vcpu); - check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); - check_split_count(vm, disable_nx_huge_pages ? 0 : 2); - - /* Give recovery thread time to run. */ - wait_for_reclaim(reclaim_period_ms); - - /* - * Now that the reclaimer has run, all the split pages should be gone. - * - * If NX huge pages are disabled, the relaimer will not run, so - * nothing should change from here on. - */ - check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); - check_split_count(vm, 0); - - /* - * The 4k mapping on hpage 3 should have been removed, so check that - * reading from it causes a huge page mapping to be installed. - */ - vcpu_run(vcpu); - check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 2); - check_split_count(vm, 0); - - kvm_vm_free(vm); -} - -static void help(char *name) -{ - puts(""); - printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); - puts(""); - printf(" -p: The NX reclaim period in milliseconds.\n"); - printf(" -t: The magic token to indicate environment setup is done.\n"); - printf(" -r: The test has reboot permissions and can disable NX huge pages.\n"); - puts(""); - exit(0); -} - -int main(int argc, char **argv) -{ - int reclaim_period_ms = 0, token = 0, opt; - bool reboot_permissions = false; - - while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { - switch (opt) { - case 'p': - reclaim_period_ms = atoi_positive("Reclaim period", optarg); - break; - case 't': - token = atoi_paranoid(optarg); - break; - case 'r': - reboot_permissions = true; - break; - case 'h': - default: - help(argv[0]); - break; - } - } - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); - - __TEST_REQUIRE(token == MAGIC_TOKEN, - "This test must be run with the magic token via '-t %d'.\n" - "Running via nx_huge_pages_test.sh, which also handles " - "environment setup, is strongly recommended.", MAGIC_TOKEN); - - run_test(reclaim_period_ms, false, reboot_permissions); - run_test(reclaim_period_ms, true, reboot_permissions); - - return 0; -} - diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh deleted file mode 100755 index caad084b8bfd..000000000000 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only */ -# -# Wrapper script which performs setup and cleanup for nx_huge_pages_test. -# Makes use of root privileges to set up huge pages and KVM module parameters. -# -# Copyright (C) 2022, Google LLC. - -set -e - -NX_HUGE_PAGES=$(cat /sys/module/kvm/parameters/nx_huge_pages) -NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio) -NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) -HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) - -# If we're already root, the host might not have sudo. 
-if [ $(whoami) == "root" ]; then - function do_sudo () { - "$@" - } -else - function do_sudo () { - sudo "$@" - } -fi - -set +e - -function sudo_echo () { - echo "$1" | do_sudo tee -a "$2" > /dev/null -} - -NXECUTABLE="$(dirname $0)/nx_huge_pages_test" - -sudo_echo test /dev/null || exit 4 # KSFT_SKIP=4 - -( - set -e - - sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages - sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio - sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms - sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - - # Test with reboot permissions - if [ $(whoami) == "root" ] || sudo setcap cap_sys_boot+ep $NXECUTABLE 2> /dev/null; then - echo Running test with CAP_SYS_BOOT enabled - $NXECUTABLE -t 887563923 -p 100 -r - test $(whoami) == "root" || sudo setcap cap_sys_boot-ep $NXECUTABLE - else - echo setcap failed, skipping nx_huge_pages_test with CAP_SYS_BOOT enabled - fi - - # Test without reboot permissions - if [ $(whoami) != "root" ] ; then - echo Running test with CAP_SYS_BOOT disabled - $NXECUTABLE -t 887563923 -p 100 - else - echo Running as root, skipping nx_huge_pages_test with CAP_SYS_BOOT disabled - fi -) -RET=$? - -sudo_echo "$NX_HUGE_PAGES" /sys/module/kvm/parameters/nx_huge_pages -sudo_echo "$NX_HUGE_PAGES_RECOVERY_RATIO" /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio -sudo_echo "$NX_HUGE_PAGES_RECOVERY_PERIOD" /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms -sudo_echo "$HUGE_PAGES" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - -exit $RET diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c deleted file mode 100644 index 9cbf283ebc55..000000000000 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Test for x86 KVM_CAP_MSR_PLATFORM_INFO - * - * Copyright (C) 2018, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Verifies expected behavior of controlling guest access to - * MSR_PLATFORM_INFO. 
- */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 - -static void guest_code(void) -{ - uint64_t msr_platform_info; - uint8_t vector; - - GUEST_SYNC(true); - msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - - GUEST_SYNC(false); - vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); - GUEST_ASSERT_EQ(vector, GP_VECTOR); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t msr_platform_info; - struct ucall uc; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); - vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, - msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - - for (;;) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); - break; - case UCALL_DONE: - goto done; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unexpected ucall %lu", uc.cmd); - break; - } - } - -done: - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c deleted file mode 100644 index 698cb36989db..000000000000 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ /dev/null @@ -1,644 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2023, Tencent, Inc. - */ -#include - -#include "pmu.h" -#include "processor.h" - -/* Number of iterations of the loop for the guest measurement payload. */ -#define NUM_LOOPS 10 - -/* Each iteration of the loop retires one branch instruction. */ -#define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS) - -/* - * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, - * 1 LOOP. - */ -#define NUM_INSNS_PER_LOOP 3 - -/* - * Number of "extra" instructions that will be counted, i.e. the number of - * instructions that are needed to set up the loop and then disable the - * counter. 2 MOV, 2 XOR, 1 WRMSR. - */ -#define NUM_EXTRA_INSNS 5 - -/* Total number of instructions retired within the measured section. */ -#define NUM_INSNS_RETIRED (NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS) - - -static uint8_t kvm_pmu_version; -static bool kvm_has_perf_caps; - -static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - void *guest_code, - uint8_t pmu_version, - uint64_t perf_capabilities) -{ - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(vcpu, guest_code); - sync_global_to_guest(vm, kvm_pmu_version); - - /* - * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling - * features via PERF_CAPABILITIES if the guest doesn't have a vPMU. 
- */ - if (kvm_has_perf_caps) - vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities); - - vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version); - return vm; -} - -static void run_vcpu(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - do { - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_PRINTF: - pr_info("%s", uc.buffer); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } - } while (uc.cmd != UCALL_DONE); -} - -static uint8_t guest_get_pmu_version(void) -{ - /* - * Return the effective PMU version, i.e. the minimum between what KVM - * supports and what is enumerated to the guest. The host deliberately - * advertises a PMU version to the guest beyond what is actually - * supported by KVM to verify KVM doesn't freak out and do something - * bizarre with an architecturally valid, but unsupported, version. - */ - return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION)); -} - -/* - * If an architectural event is supported and guaranteed to generate at least - * one "hit, assert that its count is non-zero. If an event isn't supported or - * the test can't guarantee the associated action will occur, then all bets are - * off regarding the count, i.e. no checks can be done. - * - * Sanity check that in all cases, the event doesn't count when it's disabled, - * and that KVM correctly emulates the write of an arbitrary value. - */ -static void guest_assert_event_count(uint8_t idx, - struct kvm_x86_pmu_feature event, - uint32_t pmc, uint32_t pmc_msr) -{ - uint64_t count; - - count = _rdpmc(pmc); - if (!this_pmu_has(event)) - goto sanity_checks; - - switch (idx) { - case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX: - GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED); - break; - case INTEL_ARCH_BRANCHES_RETIRED_INDEX: - GUEST_ASSERT_EQ(count, NUM_BRANCH_INSNS_RETIRED); - break; - case INTEL_ARCH_LLC_REFERENCES_INDEX: - case INTEL_ARCH_LLC_MISSES_INDEX: - if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) && - !this_cpu_has(X86_FEATURE_CLFLUSH)) - break; - fallthrough; - case INTEL_ARCH_CPU_CYCLES_INDEX: - case INTEL_ARCH_REFERENCE_CYCLES_INDEX: - GUEST_ASSERT_NE(count, 0); - break; - case INTEL_ARCH_TOPDOWN_SLOTS_INDEX: - GUEST_ASSERT(count >= NUM_INSNS_RETIRED); - break; - default: - break; - } - -sanity_checks: - __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS})); - GUEST_ASSERT_EQ(_rdpmc(pmc), count); - - wrmsr(pmc_msr, 0xdead); - GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead); -} - -/* - * Enable and disable the PMC in a monolithic asm blob to ensure that the - * compiler can't insert _any_ code into the measured sequence. Note, ECX - * doesn't need to be clobbered as the input value, @pmc_msr, is restored - * before the end of the sequence. - * - * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the - * CLFUSH{,OPT} instruction on each loop iteration to force LLC references and - * misses, i.e. to allow testing that those events actually count. - * - * If forced emulation is enabled (and specified), force emulation on a subset - * of the measured code to verify that KVM correctly emulates instructions and - * branches retired events in conjunction with hardware also counting said - * events. 
- */ -#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \ -do { \ - __asm__ __volatile__("wrmsr\n\t" \ - " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \ - "1:\n\t" \ - clflush "\n\t" \ - "mfence\n\t" \ - FEP "loop 1b\n\t" \ - FEP "mov %%edi, %%ecx\n\t" \ - FEP "xor %%eax, %%eax\n\t" \ - FEP "xor %%edx, %%edx\n\t" \ - "wrmsr\n\t" \ - :: "a"((uint32_t)_value), "d"(_value >> 32), \ - "c"(_msr), "D"(_msr) \ - ); \ -} while (0) - -#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \ -do { \ - wrmsr(pmc_msr, 0); \ - \ - if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \ - GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP); \ - else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \ - GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP); \ - else \ - GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \ - \ - guest_assert_event_count(_idx, _event, _pmc, _pmc_msr); \ -} while (0) - -static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event, - uint32_t pmc, uint32_t pmc_msr, - uint32_t ctrl_msr, uint64_t ctrl_msr_value) -{ - GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, ""); - - if (is_forced_emulation_enabled) - GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP); -} - -#define X86_PMU_FEATURE_NULL \ -({ \ - struct kvm_x86_pmu_feature feature = {}; \ - \ - feature; \ -}) - -static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event) -{ - return !(*(u64 *)&event); -} - -static void guest_test_arch_event(uint8_t idx) -{ - const struct { - struct kvm_x86_pmu_feature gp_event; - struct kvm_x86_pmu_feature fixed_event; - } intel_event_to_feature[] = { - [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED }, - [INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED }, - /* - * Note, the fixed counter for reference cycles is NOT the same - * as the general purpose architectural event. The fixed counter - * explicitly counts at the same frequency as the TSC, whereas - * the GP event counts at a fixed, but uarch specific, frequency. - * Bundle them here for simplicity. - */ - [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED }, - [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED }, - }; - - uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); - uint32_t pmu_version = guest_get_pmu_version(); - /* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */ - bool guest_has_perf_global_ctrl = pmu_version >= 2; - struct kvm_x86_pmu_feature gp_event, fixed_event; - uint32_t base_pmc_msr; - unsigned int i; - - /* The host side shouldn't invoke this without a guest PMU. 
*/ - GUEST_ASSERT(pmu_version); - - if (this_cpu_has(X86_FEATURE_PDCM) && - rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) - base_pmc_msr = MSR_IA32_PMC0; - else - base_pmc_msr = MSR_IA32_PERFCTR0; - - gp_event = intel_event_to_feature[idx].gp_event; - GUEST_ASSERT_EQ(idx, gp_event.f.bit); - - GUEST_ASSERT(nr_gp_counters); - - for (i = 0; i < nr_gp_counters; i++) { - uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS | - ARCH_PERFMON_EVENTSEL_ENABLE | - intel_pmu_arch_events[idx]; - - wrmsr(MSR_P6_EVNTSEL0 + i, 0); - if (guest_has_perf_global_ctrl) - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i)); - - __guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i, - MSR_P6_EVNTSEL0 + i, eventsel); - } - - if (!guest_has_perf_global_ctrl) - return; - - fixed_event = intel_event_to_feature[idx].fixed_event; - if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event)) - return; - - i = fixed_event.f.bit; - - wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); - - __guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED, - MSR_CORE_PERF_FIXED_CTR0 + i, - MSR_CORE_PERF_GLOBAL_CTRL, - FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); -} - -static void guest_test_arch_events(void) -{ - uint8_t i; - - for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) - guest_test_arch_event(i); - - GUEST_DONE(); -} - -static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities, - uint8_t length, uint8_t unavailable_mask) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - /* Testing arch events requires a vPMU (there are no negative tests). */ - if (!pmu_version) - return; - - vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events, - pmu_version, perf_capabilities); - - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH, - length); - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK, - unavailable_mask); - - run_vcpu(vcpu); - - kvm_vm_free(vm); -} - -/* - * Limit testing to MSRs that are actually defined by Intel (in the SDM). MSRs - * that aren't defined counter MSRs *probably* don't exist, but there's no - * guarantee that currently undefined MSR indices won't be used for something - * other than PMCs in the future. - */ -#define MAX_NR_GP_COUNTERS 8 -#define MAX_NR_FIXED_COUNTERS 3 - -#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector) \ -__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector, \ - "Expected %s on " #insn "(0x%x), got vector %u", \ - expect_gp ? 
"#GP" : "no fault", msr, vector) \ - -#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected) \ - __GUEST_ASSERT(val == expected_val, \ - "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx", \ - msr, expected_val, val); - -static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success, - uint64_t expected_val) -{ - uint8_t vector; - uint64_t val; - - vector = rdpmc_safe(rdpmc_idx, &val); - GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); - if (expect_success) - GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); - - if (!is_forced_emulation_enabled) - return; - - vector = rdpmc_safe_fep(rdpmc_idx, &val); - GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); - if (expect_success) - GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); -} - -static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters, - uint8_t nr_counters, uint32_t or_mask) -{ - const bool pmu_has_fast_mode = !guest_get_pmu_version(); - uint8_t i; - - for (i = 0; i < nr_possible_counters; i++) { - /* - * TODO: Test a value that validates full-width writes and the - * width of the counters. - */ - const uint64_t test_val = 0xffff; - const uint32_t msr = base_msr + i; - - /* - * Fixed counters are supported if the counter is less than the - * number of enumerated contiguous counters *or* the counter is - * explicitly enumerated in the supported counters mask. - */ - const bool expect_success = i < nr_counters || (or_mask & BIT(i)); - - /* - * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are - * unsupported, i.e. doesn't #GP and reads back '0'. - */ - const uint64_t expected_val = expect_success ? test_val : 0; - const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 && - msr != MSR_P6_PERFCTR1; - uint32_t rdpmc_idx; - uint8_t vector; - uint64_t val; - - vector = wrmsr_safe(msr, test_val); - GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); - - vector = rdmsr_safe(msr, &val); - GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector); - - /* On #GP, the result of RDMSR is undefined. */ - if (!expect_gp) - GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val); - - /* - * Redo the read tests with RDPMC, which has different indexing - * semantics and additional capabilities. - */ - rdpmc_idx = i; - if (base_msr == MSR_CORE_PERF_FIXED_CTR0) - rdpmc_idx |= INTEL_RDPMC_FIXED; - - guest_test_rdpmc(rdpmc_idx, expect_success, expected_val); - - /* - * KVM doesn't support non-architectural PMUs, i.e. it should - * impossible to have fast mode RDPMC. Verify that attempting - * to use fast RDPMC always #GPs. - */ - GUEST_ASSERT(!expect_success || !pmu_has_fast_mode); - rdpmc_idx |= INTEL_RDPMC_FAST; - guest_test_rdpmc(rdpmc_idx, false, -1ull); - - vector = wrmsr_safe(msr, 0); - GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); - } -} - -static void guest_test_gp_counters(void) -{ - uint8_t pmu_version = guest_get_pmu_version(); - uint8_t nr_gp_counters = 0; - uint32_t base_msr; - - if (pmu_version) - nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); - - /* - * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is - * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number - * of GP counters. If there are no GP counters, require KVM to leave - * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but - * follow the spirit of the architecture and only globally enable GP - * counters, of which there are none. 
- */ - if (pmu_version > 1) { - uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); - - if (nr_gp_counters) - GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); - else - GUEST_ASSERT_EQ(global_ctrl, 0); - } - - if (this_cpu_has(X86_FEATURE_PDCM) && - rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) - base_msr = MSR_IA32_PMC0; - else - base_msr = MSR_IA32_PERFCTR0; - - guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0); - GUEST_DONE(); -} - -static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities, - uint8_t nr_gp_counters) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters, - pmu_version, perf_capabilities); - - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS, - nr_gp_counters); - - run_vcpu(vcpu); - - kvm_vm_free(vm); -} - -static void guest_test_fixed_counters(void) -{ - uint64_t supported_bitmask = 0; - uint8_t nr_fixed_counters = 0; - uint8_t i; - - /* Fixed counters require Architectural vPMU Version 2+. */ - if (guest_get_pmu_version() >= 2) - nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - - /* - * The supported bitmask for fixed counters was introduced in PMU - * version 5. - */ - if (guest_get_pmu_version() >= 5) - supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK); - - guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS, - nr_fixed_counters, supported_bitmask); - - for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) { - uint8_t vector; - uint64_t val; - - if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) { - vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL, - FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); - __GUEST_ASSERT(vector == GP_VECTOR, - "Expected #GP for counter %u in FIXED_CTR_CTRL", i); - - vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL, - FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); - __GUEST_ASSERT(vector == GP_VECTOR, - "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i); - continue; - } - - wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0); - wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); - __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS})); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); - val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i); - - GUEST_ASSERT_NE(val, 0); - } - GUEST_DONE(); -} - -static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities, - uint8_t nr_fixed_counters, - uint32_t supported_bitmask) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters, - pmu_version, perf_capabilities); - - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK, - supported_bitmask); - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS, - nr_fixed_counters); - - run_vcpu(vcpu); - - kvm_vm_free(vm); -} - -static void test_intel_counters(void) -{ - uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); - uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); - uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); - unsigned int i; - uint8_t v, j; - uint32_t k; - - const uint64_t perf_caps[] = { - 0, - PMU_CAP_FW_WRITES, - }; - - /* - * Test up to PMU v5, which is the current maximum version defined by - * Intel, i.e. 
is the last version that is guaranteed to be backwards - * compatible with KVM's existing behavior. - */ - uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5); - - /* - * Detect the existence of events that aren't supported by selftests. - * This will (obviously) fail any time the kernel adds support for a - * new event, but it's worth paying that price to keep the test fresh. - */ - TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS, - "New architectural event(s) detected; please update this test (length = %u, mask = %x)", - nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK)); - - /* - * Force iterating over known arch events regardless of whether or not - * KVM/hardware supports a given event. - */ - nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS); - - for (v = 0; v <= max_pmu_version; v++) { - for (i = 0; i < ARRAY_SIZE(perf_caps); i++) { - if (!kvm_has_perf_caps && perf_caps[i]) - continue; - - pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n", - v, perf_caps[i]); - /* - * To keep the total runtime reasonable, test every - * possible non-zero, non-reserved bitmap combination - * only with the native PMU version and the full bit - * vector length. - */ - if (v == pmu_version) { - for (k = 1; k < (BIT(nr_arch_events) - 1); k++) - test_arch_events(v, perf_caps[i], nr_arch_events, k); - } - /* - * Test single bits for all PMU version and lengths up - * the number of events +1 (to verify KVM doesn't do - * weird things if the guest length is greater than the - * host length). Explicitly test a mask of '0' and all - * ones i.e. all events being available and unavailable. - */ - for (j = 0; j <= nr_arch_events + 1; j++) { - test_arch_events(v, perf_caps[i], j, 0); - test_arch_events(v, perf_caps[i], j, 0xff); - - for (k = 0; k < nr_arch_events; k++) - test_arch_events(v, perf_caps[i], j, BIT(k)); - } - - pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n", - v, perf_caps[i]); - for (j = 0; j <= nr_gp_counters; j++) - test_gp_counters(v, perf_caps[i], j); - - pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n", - v, perf_caps[i]); - for (j = 0; j <= nr_fixed_counters; j++) { - for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++) - test_fixed_counters(v, perf_caps[i], j, k); - } - } - } -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_is_pmu_enabled()); - - TEST_REQUIRE(host_cpu_is_intel); - TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); - TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); - - kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); - kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); - - test_intel_counters(); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c deleted file mode 100644 index c15513cd74d1..000000000000 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ /dev/null @@ -1,876 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Test for x86 KVM_SET_PMU_EVENT_FILTER. - * - * Copyright (C) 2022, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Verifies the expected behavior of allow lists and deny lists for - * virtual PMU events. 
- */ -#include "kvm_util.h" -#include "pmu.h" -#include "processor.h" -#include "test_util.h" - -#define NUM_BRANCHES 42 -#define MAX_TEST_EVENTS 10 - -#define PMU_EVENT_FILTER_INVALID_ACTION (KVM_PMU_EVENT_DENY + 1) -#define PMU_EVENT_FILTER_INVALID_FLAGS (KVM_PMU_EVENT_FLAGS_VALID_MASK << 1) -#define PMU_EVENT_FILTER_INVALID_NEVENTS (KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1) - -struct __kvm_pmu_event_filter { - __u32 action; - __u32 nevents; - __u32 fixed_counter_bitmap; - __u32 flags; - __u32 pad[4]; - __u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; -}; - -/* - * This event list comprises Intel's known architectural events, plus AMD's - * Branch Instructions Retired for Zen CPUs. Note, AMD and Intel use the - * same encoding for Instructions Retired. - */ -kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED); - -static const struct __kvm_pmu_event_filter base_event_filter = { - .nevents = ARRAY_SIZE(base_event_filter.events), - .events = { - INTEL_ARCH_CPU_CYCLES, - INTEL_ARCH_INSTRUCTIONS_RETIRED, - INTEL_ARCH_REFERENCE_CYCLES, - INTEL_ARCH_LLC_REFERENCES, - INTEL_ARCH_LLC_MISSES, - INTEL_ARCH_BRANCHES_RETIRED, - INTEL_ARCH_BRANCHES_MISPREDICTED, - INTEL_ARCH_TOPDOWN_SLOTS, - AMD_ZEN_BRANCHES_RETIRED, - }, -}; - -struct { - uint64_t loads; - uint64_t stores; - uint64_t loads_stores; - uint64_t branches_retired; - uint64_t instructions_retired; -} pmc_results; - -/* - * If we encounter a #GP during the guest PMU sanity check, then the guest - * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). - */ -static void guest_gp_handler(struct ex_regs *regs) -{ - GUEST_SYNC(-EFAULT); -} - -/* - * Check that we can write a new value to the given MSR and read it back. - * The caller should provide a non-empty set of bits that are safe to flip. - * - * Return on success. GUEST_SYNC(0) on error. - */ -static void check_msr(uint32_t msr, uint64_t bits_to_flip) -{ - uint64_t v = rdmsr(msr) ^ bits_to_flip; - - wrmsr(msr, v); - if (rdmsr(msr) != v) - GUEST_SYNC(-EIO); - - v ^= bits_to_flip; - wrmsr(msr, v); - if (rdmsr(msr) != v) - GUEST_SYNC(-EIO); -} - -static void run_and_measure_loop(uint32_t msr_base) -{ - const uint64_t branches_retired = rdmsr(msr_base + 0); - const uint64_t insn_retired = rdmsr(msr_base + 1); - - __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); - - pmc_results.branches_retired = rdmsr(msr_base + 0) - branches_retired; - pmc_results.instructions_retired = rdmsr(msr_base + 1) - insn_retired; -} - -static void intel_guest_code(void) -{ - check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); - check_msr(MSR_P6_EVNTSEL0, 0xffff); - check_msr(MSR_IA32_PMC0, 0xffff); - GUEST_SYNC(0); - - for (;;) { - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); - wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED); - wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); - - run_and_measure_loop(MSR_IA32_PMC0); - GUEST_SYNC(0); - } -} - -/* - * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], - * this code uses the always-available, legacy K7 PMU MSRs, which alias to - * the first four of the six extended core PMU MSRs. 
- */ -static void amd_guest_code(void) -{ - check_msr(MSR_K7_EVNTSEL0, 0xffff); - check_msr(MSR_K7_PERFCTR0, 0xffff); - GUEST_SYNC(0); - - for (;;) { - wrmsr(MSR_K7_EVNTSEL0, 0); - wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED); - wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED); - - run_and_measure_loop(MSR_K7_PERFCTR0); - GUEST_SYNC(0); - } -} - -/* - * Run the VM to the next GUEST_SYNC(value), and return the value passed - * to the sync. Any other exit from the guest is fatal. - */ -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - get_ucall(vcpu, &uc); - TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu", uc.cmd); - return uc.args[1]; -} - -static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu) -{ - uint64_t r; - - memset(&pmc_results, 0, sizeof(pmc_results)); - sync_global_to_guest(vcpu->vm, pmc_results); - - r = run_vcpu_to_sync(vcpu); - TEST_ASSERT(!r, "Unexpected sync value: 0x%lx", r); - - sync_global_from_guest(vcpu->vm, pmc_results); -} - -/* - * In a nested environment or if the vPMU is disabled, the guest PMU - * might not work as architected (accessing the PMU MSRs may raise - * #GP, or writes could simply be discarded). In those situations, - * there is no point in running these tests. The guest code will perform - * a sanity check and then GUEST_SYNC(success). In the case of failure, - * the behavior of the guest on resumption is undefined. - */ -static bool sanity_check_pmu(struct kvm_vcpu *vcpu) -{ - uint64_t r; - - vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); - r = run_vcpu_to_sync(vcpu); - vm_install_exception_handler(vcpu->vm, GP_VECTOR, NULL); - - return !r; -} - -/* - * Remove the first occurrence of 'event' (if any) from the filter's - * event list. 
- */ -static void remove_event(struct __kvm_pmu_event_filter *f, uint64_t event) -{ - bool found = false; - int i; - - for (i = 0; i < f->nevents; i++) { - if (found) - f->events[i - 1] = f->events[i]; - else - found = f->events[i] == event; - } - if (found) - f->nevents--; -} - -#define ASSERT_PMC_COUNTING_INSTRUCTIONS() \ -do { \ - uint64_t br = pmc_results.branches_retired; \ - uint64_t ir = pmc_results.instructions_retired; \ - \ - if (br && br != NUM_BRANCHES) \ - pr_info("%s: Branch instructions retired = %lu (expected %u)\n", \ - __func__, br, NUM_BRANCHES); \ - TEST_ASSERT(br, "%s: Branch instructions retired = %lu (expected > 0)", \ - __func__, br); \ - TEST_ASSERT(ir, "%s: Instructions retired = %lu (expected > 0)", \ - __func__, ir); \ -} while (0) - -#define ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS() \ -do { \ - uint64_t br = pmc_results.branches_retired; \ - uint64_t ir = pmc_results.instructions_retired; \ - \ - TEST_ASSERT(!br, "%s: Branch instructions retired = %lu (expected 0)", \ - __func__, br); \ - TEST_ASSERT(!ir, "%s: Instructions retired = %lu (expected 0)", \ - __func__, ir); \ -} while (0) - -static void test_without_filter(struct kvm_vcpu *vcpu) -{ - run_vcpu_and_sync_pmc_results(vcpu); - - ASSERT_PMC_COUNTING_INSTRUCTIONS(); -} - -static void test_with_filter(struct kvm_vcpu *vcpu, - struct __kvm_pmu_event_filter *__f) -{ - struct kvm_pmu_event_filter *f = (void *)__f; - - vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); - run_vcpu_and_sync_pmc_results(vcpu); -} - -static void test_amd_deny_list(struct kvm_vcpu *vcpu) -{ - struct __kvm_pmu_event_filter f = { - .action = KVM_PMU_EVENT_DENY, - .nevents = 1, - .events = { - RAW_EVENT(0x1C2, 0), - }, - }; - - test_with_filter(vcpu, &f); - - ASSERT_PMC_COUNTING_INSTRUCTIONS(); -} - -static void test_member_deny_list(struct kvm_vcpu *vcpu) -{ - struct __kvm_pmu_event_filter f = base_event_filter; - - f.action = KVM_PMU_EVENT_DENY; - test_with_filter(vcpu, &f); - - ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); -} - -static void test_member_allow_list(struct kvm_vcpu *vcpu) -{ - struct __kvm_pmu_event_filter f = base_event_filter; - - f.action = KVM_PMU_EVENT_ALLOW; - test_with_filter(vcpu, &f); - - ASSERT_PMC_COUNTING_INSTRUCTIONS(); -} - -static void test_not_member_deny_list(struct kvm_vcpu *vcpu) -{ - struct __kvm_pmu_event_filter f = base_event_filter; - - f.action = KVM_PMU_EVENT_DENY; - - remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); - remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); - remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); - test_with_filter(vcpu, &f); - - ASSERT_PMC_COUNTING_INSTRUCTIONS(); -} - -static void test_not_member_allow_list(struct kvm_vcpu *vcpu) -{ - struct __kvm_pmu_event_filter f = base_event_filter; - - f.action = KVM_PMU_EVENT_ALLOW; - - remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); - remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); - remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); - test_with_filter(vcpu, &f); - - ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); -} - -/* - * Verify that setting KVM_PMU_CAP_DISABLE prevents the use of the PMU. - * - * Note that KVM_CAP_PMU_CAPABILITY must be invoked prior to creating VCPUs. 
- */ -static void test_pmu_config_disable(void (*guest_code)(void)) -{ - struct kvm_vcpu *vcpu; - int r; - struct kvm_vm *vm; - - r = kvm_check_cap(KVM_CAP_PMU_CAPABILITY); - if (!(r & KVM_PMU_CAP_DISABLE)) - return; - - vm = vm_create(1); - - vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); - - vcpu = vm_vcpu_add(vm, 0, guest_code); - TEST_ASSERT(!sanity_check_pmu(vcpu), - "Guest should not be able to use disabled PMU."); - - kvm_vm_free(vm); -} - -/* - * On Intel, check for a non-zero PMU version, at least one general-purpose - * counter per logical processor, and support for counting the number of branch - * instructions retired. - */ -static bool use_intel_pmu(void) -{ - return host_cpu_is_intel && - kvm_cpu_property(X86_PROPERTY_PMU_VERSION) && - kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) && - kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED); -} - -/* - * On AMD, all Family 17h+ CPUs (Zen and its successors) use event encoding - * 0xc2,0 for Branch Instructions Retired. - */ -static bool use_amd_pmu(void) -{ - return host_cpu_is_amd && kvm_cpu_family() >= 0x17; -} - -/* - * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and - * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/ - * supported on Intel Xeon processors: - * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake. - */ -#define MEM_INST_RETIRED 0xD0 -#define MEM_INST_RETIRED_LOAD RAW_EVENT(MEM_INST_RETIRED, 0x81) -#define MEM_INST_RETIRED_STORE RAW_EVENT(MEM_INST_RETIRED, 0x82) -#define MEM_INST_RETIRED_LOAD_STORE RAW_EVENT(MEM_INST_RETIRED, 0x83) - -static bool supports_event_mem_inst_retired(void) -{ - uint32_t eax, ebx, ecx, edx; - - cpuid(1, &eax, &ebx, &ecx, &edx); - if (x86_family(eax) == 0x6) { - switch (x86_model(eax)) { - /* Sapphire Rapids */ - case 0x8F: - /* Ice Lake */ - case 0x6A: - /* Skylake */ - /* Cascade Lake */ - case 0x55: - return true; - } - } - - return false; -} - -/* - * "LS Dispatch", from Processor Programming Reference - * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, - * Preliminary Processor Programming Reference (PPR) for AMD Family - * 17h Model 31h, Revision B0 Processors, and Preliminary Processor - * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision - * B1 Processors Volume 1 of 2. - */ -#define LS_DISPATCH 0x29 -#define LS_DISPATCH_LOAD RAW_EVENT(LS_DISPATCH, BIT(0)) -#define LS_DISPATCH_STORE RAW_EVENT(LS_DISPATCH, BIT(1)) -#define LS_DISPATCH_LOAD_STORE RAW_EVENT(LS_DISPATCH, BIT(2)) - -#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \ - KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false) -#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \ - KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true) - -static void masked_events_guest_test(uint32_t msr_base) -{ - /* - * The actual value of the counters don't determine the outcome of - * the test. Only that they are zero or non-zero. 
- */ - const uint64_t loads = rdmsr(msr_base + 0); - const uint64_t stores = rdmsr(msr_base + 1); - const uint64_t loads_stores = rdmsr(msr_base + 2); - int val; - - - __asm__ __volatile__("movl $0, %[v];" - "movl %[v], %%eax;" - "incl %[v];" - : [v]"+m"(val) :: "eax"); - - pmc_results.loads = rdmsr(msr_base + 0) - loads; - pmc_results.stores = rdmsr(msr_base + 1) - stores; - pmc_results.loads_stores = rdmsr(msr_base + 2) - loads_stores; -} - -static void intel_masked_events_guest_code(void) -{ - for (;;) { - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD); - wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE); - wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE); - - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7); - - masked_events_guest_test(MSR_IA32_PMC0); - GUEST_SYNC(0); - } -} - -static void amd_masked_events_guest_code(void) -{ - for (;;) { - wrmsr(MSR_K7_EVNTSEL0, 0); - wrmsr(MSR_K7_EVNTSEL1, 0); - wrmsr(MSR_K7_EVNTSEL2, 0); - - wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD); - wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE); - wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE | - ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE); - - masked_events_guest_test(MSR_K7_PERFCTR0); - GUEST_SYNC(0); - } -} - -static void run_masked_events_test(struct kvm_vcpu *vcpu, - const uint64_t masked_events[], - const int nmasked_events) -{ - struct __kvm_pmu_event_filter f = { - .nevents = nmasked_events, - .action = KVM_PMU_EVENT_ALLOW, - .flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS, - }; - - memcpy(f.events, masked_events, sizeof(uint64_t) * nmasked_events); - test_with_filter(vcpu, &f); -} - -#define ALLOW_LOADS BIT(0) -#define ALLOW_STORES BIT(1) -#define ALLOW_LOADS_STORES BIT(2) - -struct masked_events_test { - uint64_t intel_events[MAX_TEST_EVENTS]; - uint64_t intel_event_end; - uint64_t amd_events[MAX_TEST_EVENTS]; - uint64_t amd_event_end; - const char *msg; - uint32_t flags; -}; - -/* - * These are the test cases for the masked events tests. - * - * For each test, the guest enables 3 PMU counters (loads, stores, - * loads + stores). The filter is then set in KVM with the masked events - * provided. The test then verifies that the counters agree with which - * ones should be counting and which ones should be filtered. 
- */ -const struct masked_events_test test_cases[] = { - { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), - }, - .msg = "Only allow loads.", - .flags = ALLOW_LOADS, - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), - }, - .msg = "Only allow stores.", - .flags = ALLOW_STORES, - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)), - }, - .msg = "Only allow loads + stores.", - .flags = ALLOW_LOADS_STORES, - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), - EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0), - }, - .msg = "Only allow loads and stores.", - .flags = ALLOW_LOADS | ALLOW_STORES, - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), - EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), - EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), - }, - .msg = "Only allow loads and loads + stores.", - .flags = ALLOW_LOADS | ALLOW_LOADS_STORES - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), - EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), - }, - .msg = "Only allow stores and loads + stores.", - .flags = ALLOW_STORES | ALLOW_LOADS_STORES - }, { - .intel_events = { - INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), - }, - .amd_events = { - INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), - }, - .msg = "Only allow loads, stores, and loads + stores.", - .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES - }, -}; - -static int append_test_events(const struct masked_events_test *test, - uint64_t *events, int nevents) -{ - const uint64_t *evts; - int i; - - evts = use_intel_pmu() ? test->intel_events : test->amd_events; - for (i = 0; i < MAX_TEST_EVENTS; i++) { - if (evts[i] == 0) - break; - - events[nevents + i] = evts[i]; - } - - return nevents + i; -} - -static bool bool_eq(bool a, bool b) -{ - return a == b; -} - -static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events, - int nevents) -{ - int ntests = ARRAY_SIZE(test_cases); - int i, n; - - for (i = 0; i < ntests; i++) { - const struct masked_events_test *test = &test_cases[i]; - - /* Do any test case events overflow MAX_TEST_EVENTS? 
*/ - assert(test->intel_event_end == 0); - assert(test->amd_event_end == 0); - - n = append_test_events(test, events, nevents); - - run_masked_events_test(vcpu, events, n); - - TEST_ASSERT(bool_eq(pmc_results.loads, test->flags & ALLOW_LOADS) && - bool_eq(pmc_results.stores, test->flags & ALLOW_STORES) && - bool_eq(pmc_results.loads_stores, - test->flags & ALLOW_LOADS_STORES), - "%s loads: %lu, stores: %lu, loads + stores: %lu", - test->msg, pmc_results.loads, pmc_results.stores, - pmc_results.loads_stores); - } -} - -static void add_dummy_events(uint64_t *events, int nevents) -{ - int i; - - for (i = 0; i < nevents; i++) { - int event_select = i % 0xFF; - bool exclude = ((i % 4) == 0); - - if (event_select == MEM_INST_RETIRED || - event_select == LS_DISPATCH) - event_select++; - - events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0, - 0, exclude); - } -} - -static void test_masked_events(struct kvm_vcpu *vcpu) -{ - int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS; - uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; - - /* Run the test cases against a sparse PMU event filter. */ - run_masked_events_tests(vcpu, events, 0); - - /* Run the test cases against a dense PMU event filter. */ - add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS); - run_masked_events_tests(vcpu, events, nevents); -} - -static int set_pmu_event_filter(struct kvm_vcpu *vcpu, - struct __kvm_pmu_event_filter *__f) -{ - struct kvm_pmu_event_filter *f = (void *)__f; - - return __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); -} - -static int set_pmu_single_event_filter(struct kvm_vcpu *vcpu, uint64_t event, - uint32_t flags, uint32_t action) -{ - struct __kvm_pmu_event_filter f = { - .nevents = 1, - .flags = flags, - .action = action, - .events = { - event, - }, - }; - - return set_pmu_event_filter(vcpu, &f); -} - -static void test_filter_ioctl(struct kvm_vcpu *vcpu) -{ - uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - struct __kvm_pmu_event_filter f; - uint64_t e = ~0ul; - int r; - - /* - * Unfortunately having invalid bits set in event data is expected to - * pass when flags == 0 (bits other than eventsel+umask). 
- */ - r = set_pmu_single_event_filter(vcpu, e, 0, KVM_PMU_EVENT_ALLOW); - TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); - - r = set_pmu_single_event_filter(vcpu, e, - KVM_PMU_EVENT_FLAG_MASKED_EVENTS, - KVM_PMU_EVENT_ALLOW); - TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail"); - - e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf); - r = set_pmu_single_event_filter(vcpu, e, - KVM_PMU_EVENT_FLAG_MASKED_EVENTS, - KVM_PMU_EVENT_ALLOW); - TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); - - f = base_event_filter; - f.action = PMU_EVENT_FILTER_INVALID_ACTION; - r = set_pmu_event_filter(vcpu, &f); - TEST_ASSERT(r, "Set invalid action is expected to fail"); - - f = base_event_filter; - f.flags = PMU_EVENT_FILTER_INVALID_FLAGS; - r = set_pmu_event_filter(vcpu, &f); - TEST_ASSERT(r, "Set invalid flags is expected to fail"); - - f = base_event_filter; - f.nevents = PMU_EVENT_FILTER_INVALID_NEVENTS; - r = set_pmu_event_filter(vcpu, &f); - TEST_ASSERT(r, "Exceeding the max number of filter events should fail"); - - f = base_event_filter; - f.fixed_counter_bitmap = ~GENMASK_ULL(nr_fixed_counters, 0); - r = set_pmu_event_filter(vcpu, &f); - TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed"); -} - -static void intel_run_fixed_counter_guest_code(uint8_t idx) -{ - for (;;) { - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); - wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0); - - /* Only OS_EN bit is enabled for fixed counter[idx]. */ - wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL)); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx)); - __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx)); - } -} - -static uint64_t test_with_fixed_counter_filter(struct kvm_vcpu *vcpu, - uint32_t action, uint32_t bitmap) -{ - struct __kvm_pmu_event_filter f = { - .action = action, - .fixed_counter_bitmap = bitmap, - }; - set_pmu_event_filter(vcpu, &f); - - return run_vcpu_to_sync(vcpu); -} - -static uint64_t test_set_gp_and_fixed_event_filter(struct kvm_vcpu *vcpu, - uint32_t action, - uint32_t bitmap) -{ - struct __kvm_pmu_event_filter f = base_event_filter; - - f.action = action; - f.fixed_counter_bitmap = bitmap; - set_pmu_event_filter(vcpu, &f); - - return run_vcpu_to_sync(vcpu); -} - -static void __test_fixed_counter_bitmap(struct kvm_vcpu *vcpu, uint8_t idx, - uint8_t nr_fixed_counters) -{ - unsigned int i; - uint32_t bitmap; - uint64_t count; - - TEST_ASSERT(nr_fixed_counters < sizeof(bitmap) * 8, - "Invalid nr_fixed_counters"); - - /* - * Check the fixed performance counter can count normally when KVM - * userspace doesn't set any pmu filter. - */ - count = run_vcpu_to_sync(vcpu); - TEST_ASSERT(count, "Unexpected count value: %ld", count); - - for (i = 0; i < BIT(nr_fixed_counters); i++) { - bitmap = BIT(i); - count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_ALLOW, - bitmap); - TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); - - count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_DENY, - bitmap); - TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); - - /* - * Check that fixed_counter_bitmap has higher priority than - * events[] when both are set. 
- */ - count = test_set_gp_and_fixed_event_filter(vcpu, - KVM_PMU_EVENT_ALLOW, - bitmap); - TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); - - count = test_set_gp_and_fixed_event_filter(vcpu, - KVM_PMU_EVENT_DENY, - bitmap); - TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); - } -} - -static void test_fixed_counter_bitmap(void) -{ - uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - uint8_t idx; - - /* - * Check that pmu_event_filter works as expected when it's applied to - * fixed performance counters. - */ - for (idx = 0; idx < nr_fixed_counters; idx++) { - vm = vm_create_with_one_vcpu(&vcpu, - intel_run_fixed_counter_guest_code); - vcpu_args_set(vcpu, 1, idx); - __test_fixed_counter_bitmap(vcpu, idx, nr_fixed_counters); - kvm_vm_free(vm); - } -} - -int main(int argc, char *argv[]) -{ - void (*guest_code)(void); - struct kvm_vcpu *vcpu, *vcpu2 = NULL; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_is_pmu_enabled()); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS)); - - TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); - guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - TEST_REQUIRE(sanity_check_pmu(vcpu)); - - if (use_amd_pmu()) - test_amd_deny_list(vcpu); - - test_without_filter(vcpu); - test_member_deny_list(vcpu); - test_member_allow_list(vcpu); - test_not_member_deny_list(vcpu); - test_not_member_allow_list(vcpu); - - if (use_intel_pmu() && - supports_event_mem_inst_retired() && - kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3) - vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code); - else if (use_amd_pmu()) - vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code); - - if (vcpu2) - test_masked_events(vcpu2); - test_filter_ioctl(vcpu); - - kvm_vm_free(vm); - - test_pmu_config_disable(guest_code); - test_fixed_counter_bitmap(); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c deleted file mode 100644 index 82a8d88b5338..000000000000 --- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c +++ /dev/null @@ -1,483 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2022, Google LLC. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#define BASE_DATA_SLOT 10 -#define BASE_DATA_GPA ((uint64_t)(1ull << 32)) -#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE)) - -/* Horrific macro so that the line info is captured accurately :-( */ -#define memcmp_g(gpa, pattern, size) \ -do { \ - uint8_t *mem = (uint8_t *)gpa; \ - size_t i; \ - \ - for (i = 0; i < size; i++) \ - __GUEST_ASSERT(mem[i] == pattern, \ - "Guest expected 0x%x at offset %lu (gpa 0x%lx), got 0x%x", \ - pattern, i, gpa + i, mem[i]); \ -} while (0) - -static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size) -{ - size_t i; - - for (i = 0; i < size; i++) - TEST_ASSERT(mem[i] == pattern, - "Host expected 0x%x at gpa 0x%lx, got 0x%x", - pattern, gpa + i, mem[i]); -} - -/* - * Run memory conversion tests with explicit conversion: - * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit - * to back/unback private memory. 
Subsequent accesses by guest to the gpa range - * will not cause exit to userspace. - * - * Test memory conversion scenarios with following steps: - * 1) Access private memory using private access and verify that memory contents - * are not visible to userspace. - * 2) Convert memory to shared using explicit conversions and ensure that - * userspace is able to access the shared regions. - * 3) Convert memory back to private using explicit conversions and ensure that - * userspace is again not able to access converted private regions. - */ - -#define GUEST_STAGE(o, s) { .offset = o, .size = s } - -enum ucall_syncs { - SYNC_SHARED, - SYNC_PRIVATE, -}; - -static void guest_sync_shared(uint64_t gpa, uint64_t size, - uint8_t current_pattern, uint8_t new_pattern) -{ - GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern); -} - -static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern) -{ - GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern); -} - -/* Arbitrary values, KVM doesn't care about the attribute flags. */ -#define MAP_GPA_SET_ATTRIBUTES BIT(0) -#define MAP_GPA_SHARED BIT(1) -#define MAP_GPA_DO_FALLOCATE BIT(2) - -static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared, - bool do_fallocate) -{ - uint64_t flags = MAP_GPA_SET_ATTRIBUTES; - - if (map_shared) - flags |= MAP_GPA_SHARED; - if (do_fallocate) - flags |= MAP_GPA_DO_FALLOCATE; - kvm_hypercall_map_gpa_range(gpa, size, flags); -} - -static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate) -{ - guest_map_mem(gpa, size, true, do_fallocate); -} - -static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate) -{ - guest_map_mem(gpa, size, false, do_fallocate); -} - -struct { - uint64_t offset; - uint64_t size; -} static const test_ranges[] = { - GUEST_STAGE(0, PAGE_SIZE), - GUEST_STAGE(0, SZ_2M), - GUEST_STAGE(PAGE_SIZE, PAGE_SIZE), - GUEST_STAGE(PAGE_SIZE, SZ_2M), - GUEST_STAGE(SZ_2M, PAGE_SIZE), -}; - -static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate) -{ - const uint8_t def_p = 0xaa; - const uint8_t init_p = 0xcc; - uint64_t j; - int i; - - /* Memory should be shared by default. */ - memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE); - memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE); - guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p); - - memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); - - for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { - uint64_t gpa = base_gpa + test_ranges[i].offset; - uint64_t size = test_ranges[i].size; - uint8_t p1 = 0x11; - uint8_t p2 = 0x22; - uint8_t p3 = 0x33; - uint8_t p4 = 0x44; - - /* - * Set the test region to pattern one to differentiate it from - * the data range as a whole (contains the initial pattern). - */ - memset((void *)gpa, p1, size); - - /* - * Convert to private, set and verify the private data, and - * then verify that the rest of the data (map shared) still - * holds the initial pattern, and that the host always sees the - * shared memory (initial pattern). Unlike shared memory, - * punching a hole in private memory is destructive, i.e. - * previous values aren't guaranteed to be preserved. - */ - guest_map_private(gpa, size, do_fallocate); - - if (size > PAGE_SIZE) { - memset((void *)gpa, p2, PAGE_SIZE); - goto skip; - } - - memset((void *)gpa, p2, size); - guest_sync_private(gpa, size, p1); - - /* - * Verify that the private memory was set to pattern two, and - * that shared memory still holds the initial pattern. 
- */ - memcmp_g(gpa, p2, size); - if (gpa > base_gpa) - memcmp_g(base_gpa, init_p, gpa - base_gpa); - if (gpa + size < base_gpa + PER_CPU_DATA_SIZE) - memcmp_g(gpa + size, init_p, - (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size)); - - /* - * Convert odd-number page frames back to shared to verify KVM - * also correctly handles holes in private ranges. - */ - for (j = 0; j < size; j += PAGE_SIZE) { - if ((j >> PAGE_SHIFT) & 1) { - guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate); - guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3); - - memcmp_g(gpa + j, p3, PAGE_SIZE); - } else { - guest_sync_private(gpa + j, PAGE_SIZE, p1); - } - } - -skip: - /* - * Convert the entire region back to shared, explicitly write - * pattern three to fill in the even-number frames before - * asking the host to verify (and write pattern four). - */ - guest_map_shared(gpa, size, do_fallocate); - memset((void *)gpa, p3, size); - guest_sync_shared(gpa, size, p3, p4); - memcmp_g(gpa, p4, size); - - /* Reset the shared memory back to the initial pattern. */ - memset((void *)gpa, init_p, size); - - /* - * Free (via PUNCH_HOLE) *all* private memory so that the next - * iteration starts from a clean slate, e.g. with respect to - * whether or not there are pages/folios in guest_mem. - */ - guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true); - } -} - -static void guest_punch_hole(uint64_t gpa, uint64_t size) -{ - /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */ - uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE; - - kvm_hypercall_map_gpa_range(gpa, size, flags); -} - -/* - * Test that PUNCH_HOLE actually frees memory by punching holes without doing a - * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating - * (subsequent fault) should zero memory. - */ -static void guest_test_punch_hole(uint64_t base_gpa, bool precise) -{ - const uint8_t init_p = 0xcc; - int i; - - /* - * Convert the entire range to private, this testcase is all about - * punching holes in guest_memfd, i.e. shared mappings aren't needed. - */ - guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false); - - for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { - uint64_t gpa = base_gpa + test_ranges[i].offset; - uint64_t size = test_ranges[i].size; - - /* - * Free all memory before each iteration, even for the !precise - * case where the memory will be faulted back in. Freeing and - * reallocating should obviously work, and freeing all memory - * minimizes the probability of cross-testcase influence. - */ - guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE); - - /* Fault-in and initialize memory, and verify the pattern. */ - if (precise) { - memset((void *)gpa, init_p, size); - memcmp_g(gpa, init_p, size); - } else { - memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE); - memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); - } - - /* - * Punch a hole at the target range and verify that reads from - * the guest succeed and return zeroes. - */ - guest_punch_hole(gpa, size); - memcmp_g(gpa, 0, size); - } -} - -static void guest_code(uint64_t base_gpa) -{ - /* - * Run the conversion test twice, with and without doing fallocate() on - * the guest_memfd backing when converting between shared and private. - */ - guest_test_explicit_conversion(base_gpa, false); - guest_test_explicit_conversion(base_gpa, true); - - /* - * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd - * faulted in, once with only the target range faulted in. 
- */ - guest_test_punch_hole(base_gpa, false); - guest_test_punch_hole(base_gpa, true); - GUEST_DONE(); -} - -static void handle_exit_hypercall(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - uint64_t gpa = run->hypercall.args[0]; - uint64_t size = run->hypercall.args[1] * PAGE_SIZE; - bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES; - bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED; - bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE; - struct kvm_vm *vm = vcpu->vm; - - TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE, - "Wanted MAP_GPA_RANGE (%u), got '%llu'", - KVM_HC_MAP_GPA_RANGE, run->hypercall.nr); - - if (do_fallocate) - vm_guest_mem_fallocate(vm, gpa, size, map_shared); - - if (set_attributes) - vm_set_memory_attributes(vm, gpa, size, - map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE); - run->hypercall.ret = 0; -} - -static bool run_vcpus; - -static void *__test_mem_conversions(void *__vcpu) -{ - struct kvm_vcpu *vcpu = __vcpu; - struct kvm_run *run = vcpu->run; - struct kvm_vm *vm = vcpu->vm; - struct ucall uc; - - while (!READ_ONCE(run_vcpus)) - ; - - for ( ;; ) { - vcpu_run(vcpu); - - if (run->exit_reason == KVM_EXIT_HYPERCALL) { - handle_exit_hypercall(vcpu); - continue; - } - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", - run->exit_reason, exit_reason_str(run->exit_reason)); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - case UCALL_SYNC: { - uint64_t gpa = uc.args[1]; - size_t size = uc.args[2]; - size_t i; - - TEST_ASSERT(uc.args[0] == SYNC_SHARED || - uc.args[0] == SYNC_PRIVATE, - "Unknown sync command '%ld'", uc.args[0]); - - for (i = 0; i < size; i += vm->page_size) { - size_t nr_bytes = min_t(size_t, vm->page_size, size - i); - uint8_t *hva = addr_gpa2hva(vm, gpa + i); - - /* In all cases, the host should observe the shared data. */ - memcmp_h(hva, gpa + i, uc.args[3], nr_bytes); - - /* For shared, write the new pattern to guest memory. */ - if (uc.args[0] == SYNC_SHARED) - memset(hva, uc.args[4], nr_bytes); - } - break; - } - case UCALL_DONE: - return NULL; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } - } -} - -static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, - uint32_t nr_memslots) -{ - /* - * Allocate enough memory so that each vCPU's chunk of memory can be - * naturally aligned with respect to the size of the backing store. 
- */ - const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type)); - const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment); - const size_t memfd_size = per_cpu_size * nr_vcpus; - const size_t slot_size = memfd_size / nr_memslots; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - pthread_t threads[KVM_MAX_VCPUS]; - struct kvm_vm *vm; - int memfd, i, r; - - const struct vm_shape shape = { - .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, - }; - - TEST_ASSERT(slot_size * nr_memslots == memfd_size, - "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)", - memfd_size, nr_memslots); - vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus); - - vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); - - memfd = vm_create_guest_memfd(vm, memfd_size, 0); - - for (i = 0; i < nr_memslots; i++) - vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, - BASE_DATA_SLOT + i, slot_size / vm->page_size, - KVM_MEM_GUEST_MEMFD, memfd, slot_size * i); - - for (i = 0; i < nr_vcpus; i++) { - uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size; - - vcpu_args_set(vcpus[i], 1, gpa); - - /* - * Map only what is needed so that an out-of-bounds access - * results #PF => SHUTDOWN instead of data corruption. - */ - virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size); - - pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]); - } - - WRITE_ONCE(run_vcpus, true); - - for (i = 0; i < nr_vcpus; i++) - pthread_join(threads[i], NULL); - - kvm_vm_free(vm); - - /* - * Allocate and free memory from the guest_memfd after closing the VM - * fd. The guest_memfd is gifted a reference to its owning VM, i.e. - * should prevent the VM from being fully destroyed until the last - * reference to the guest_memfd is also put. - */ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - close(memfd); -} - -static void usage(const char *cmd) -{ - puts(""); - printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd); - puts(""); - backing_src_help("-s"); - puts(""); - puts(" -n: specify the number of vcpus (default: 1)"); - puts(""); - puts(" -m: specify the number of memslots (default: 1)"); - puts(""); -} - -int main(int argc, char *argv[]) -{ - enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC; - uint32_t nr_memslots = 1; - uint32_t nr_vcpus = 1; - int opt; - - TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); - - while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) { - switch (opt) { - case 's': - src_type = parse_backing_src_type(optarg); - break; - case 'n': - nr_vcpus = atoi_positive("nr_vcpus", optarg); - break; - case 'm': - nr_memslots = atoi_positive("nr_memslots", optarg); - break; - case 'h': - default: - usage(argv[0]); - exit(0); - } - } - - test_mem_conversions(src_type, nr_vcpus, nr_memslots); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c deleted file mode 100644 index 13e72fcec8dd..000000000000 --- a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2023, Google LLC. 
- */ -#include -#include -#include - -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" - -/* Arbitrarily selected to avoid overlaps with anything else */ -#define EXITS_TEST_GVA 0xc0000000 -#define EXITS_TEST_GPA EXITS_TEST_GVA -#define EXITS_TEST_NPAGES 1 -#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE) -#define EXITS_TEST_SLOT 10 - -static uint64_t guest_repeatedly_read(void) -{ - volatile uint64_t value; - - while (true) - value = *((uint64_t *) EXITS_TEST_GVA); - - return value; -} - -static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu) -{ - int r; - - r = _vcpu_run(vcpu); - if (r) { - TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r)); - TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT); - } - return vcpu->run->exit_reason; -} - -const struct vm_shape protected_vm_shape = { - .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, -}; - -static void test_private_access_memslot_deleted(void) -{ - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - pthread_t vm_thread; - void *thread_return; - uint32_t exit_reason; - - vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, - guest_repeatedly_read); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - EXITS_TEST_GPA, EXITS_TEST_SLOT, - EXITS_TEST_NPAGES, - KVM_MEM_GUEST_MEMFD); - - virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); - - /* Request to access page privately */ - vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); - - pthread_create(&vm_thread, NULL, - (void *(*)(void *))run_vcpu_get_exit_reason, - (void *)vcpu); - - vm_mem_region_delete(vm, EXITS_TEST_SLOT); - - pthread_join(vm_thread, &thread_return); - exit_reason = (uint32_t)(uint64_t)thread_return; - - TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); - TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); - TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); - TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); - - kvm_vm_free(vm); -} - -static void test_private_access_memslot_not_private(void) -{ - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - uint32_t exit_reason; - - vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, - guest_repeatedly_read); - - /* Add a non-private memslot (flags = 0) */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - EXITS_TEST_GPA, EXITS_TEST_SLOT, - EXITS_TEST_NPAGES, 0); - - virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); - - /* Request to access page privately */ - vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); - - exit_reason = run_vcpu_get_exit_reason(vcpu); - - TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); - TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); - TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); - TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); - - test_private_access_memslot_deleted(); - test_private_access_memslot_not_private(); -} diff --git a/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c deleted file mode 100644 index cbc92a862ea9..000000000000 --- a/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test edge cases and race conditions in kvm_recalculate_apic_map(). 
- */ - -#include -#include -#include - -#include "processor.h" -#include "test_util.h" -#include "kvm_util.h" -#include "apic.h" - -#define TIMEOUT 5 /* seconds */ - -#define LAPIC_DISABLED 0 -#define LAPIC_X2APIC (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) -#define MAX_XAPIC_ID 0xff - -static void *race(void *arg) -{ - struct kvm_lapic_state lapic = {}; - struct kvm_vcpu *vcpu = arg; - - while (1) { - /* Trigger kvm_recalculate_apic_map(). */ - vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); - pthread_testcancel(); - } - - return NULL; -} - -int main(void) -{ - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - struct kvm_vcpu *vcpuN; - struct kvm_vm *vm; - pthread_t thread; - time_t t; - int i; - - kvm_static_assert(KVM_MAX_VCPUS > MAX_XAPIC_ID); - - /* - * Create the max number of vCPUs supported by selftests so that KVM - * has decent amount of work to do when recalculating the map, i.e. to - * make the problematic window large enough to hit. - */ - vm = vm_create_with_vcpus(KVM_MAX_VCPUS, NULL, vcpus); - - /* - * Enable x2APIC on all vCPUs so that KVM doesn't bail from the recalc - * due to vCPUs having aliased xAPIC IDs (truncated to 8 bits). - */ - for (i = 0; i < KVM_MAX_VCPUS; i++) - vcpu_set_msr(vcpus[i], MSR_IA32_APICBASE, LAPIC_X2APIC); - - TEST_ASSERT_EQ(pthread_create(&thread, NULL, race, vcpus[0]), 0); - - vcpuN = vcpus[KVM_MAX_VCPUS - 1]; - for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { - vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_X2APIC); - vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_DISABLED); - } - - TEST_ASSERT_EQ(pthread_cancel(thread), 0); - TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); - - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c deleted file mode 100644 index 49913784bc82..000000000000 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Test that KVM_SET_BOOT_CPU_ID works as intended - * - * Copyright (C) 2020, Red Hat, Inc. 
- */ -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "apic.h" - -static void guest_bsp_vcpu(void *arg) -{ - GUEST_SYNC(1); - - GUEST_ASSERT_NE(get_bsp_flag(), 0); - - GUEST_DONE(); -} - -static void guest_not_bsp_vcpu(void *arg) -{ - GUEST_SYNC(1); - - GUEST_ASSERT_EQ(get_bsp_flag(), 0); - - GUEST_DONE(); -} - -static void test_set_invalid_bsp(struct kvm_vm *vm) -{ - unsigned long max_vcpu_id = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); - int r; - - if (max_vcpu_id) { - r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(max_vcpu_id + 1)); - TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID > MAX should fail"); - } - - r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(1L << 32)); - TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID[63:32]!=0 should fail"); -} - -static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg) -{ - int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID, - (void *)(unsigned long)vcpu->id); - - TEST_ASSERT(r == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set %s", msg); -} - -static void run_vcpu(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - int stage; - - for (stage = 0; stage < 2; stage++) { - - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1, - "Stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); - test_set_bsp_busy(vcpu, "while running vm"); - break; - case UCALL_DONE: - TEST_ASSERT(stage == 1, - "Expected GUEST_DONE in stage 2, got stage %d", - stage); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu->run->exit_reason)); - } - } -} - -static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id, - struct kvm_vcpu *vcpus[]) -{ - struct kvm_vm *vm; - uint32_t i; - - vm = vm_create(nr_vcpus); - - test_set_invalid_bsp(vm); - - vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id); - - for (i = 0; i < nr_vcpus; i++) - vcpus[i] = vm_vcpu_add(vm, i, i == bsp_vcpu_id ? guest_bsp_vcpu : - guest_not_bsp_vcpu); - return vm; -} - -static void run_vm_bsp(uint32_t bsp_vcpu_id) -{ - struct kvm_vcpu *vcpus[2]; - struct kvm_vm *vm; - - vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus); - - run_vcpu(vcpus[0]); - run_vcpu(vcpus[1]); - - kvm_vm_free(vm); -} - -static void check_set_bsp_busy(void) -{ - struct kvm_vcpu *vcpus[2]; - struct kvm_vm *vm; - - vm = create_vm(ARRAY_SIZE(vcpus), 0, vcpus); - - test_set_bsp_busy(vcpus[1], "after adding vcpu"); - - run_vcpu(vcpus[0]); - run_vcpu(vcpus[1]); - - test_set_bsp_busy(vcpus[1], "to a terminated vcpu"); - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)); - - run_vm_bsp(0); - run_vm_bsp(1); - run_vm_bsp(0); - - check_set_bsp_busy(); -} diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c deleted file mode 100644 index c021c0795a96..000000000000 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * KVM_SET_SREGS tests - * - * Copyright (C) 2018, Google LLC. 
- * - * This is a regression test for the bug fixed by the following commit: - * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values") - * - * That bug allowed a user-mode program that called the KVM_SET_SREGS - * ioctl to put a VCPU's local APIC into an invalid state. - */ -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" - -#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \ -do { \ - struct kvm_sregs new; \ - int rc; \ - \ - /* Skip the sub-test, the feature/bit is supported. */ \ - if (orig.cr & bit) \ - break; \ - \ - memcpy(&new, &orig, sizeof(sregs)); \ - new.cr |= bit; \ - \ - rc = _vcpu_sregs_set(vcpu, &new); \ - TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \ - \ - /* Sanity check that KVM didn't change anything. */ \ - vcpu_sregs_get(vcpu, &new); \ - TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \ -} while (0) - -static uint64_t calc_supported_cr4_feature_bits(void) -{ - uint64_t cr4; - - cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | - X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | - X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; - if (kvm_cpu_has(X86_FEATURE_UMIP)) - cr4 |= X86_CR4_UMIP; - if (kvm_cpu_has(X86_FEATURE_LA57)) - cr4 |= X86_CR4_LA57; - if (kvm_cpu_has(X86_FEATURE_VMX)) - cr4 |= X86_CR4_VMXE; - if (kvm_cpu_has(X86_FEATURE_SMX)) - cr4 |= X86_CR4_SMXE; - if (kvm_cpu_has(X86_FEATURE_FSGSBASE)) - cr4 |= X86_CR4_FSGSBASE; - if (kvm_cpu_has(X86_FEATURE_PCID)) - cr4 |= X86_CR4_PCIDE; - if (kvm_cpu_has(X86_FEATURE_XSAVE)) - cr4 |= X86_CR4_OSXSAVE; - if (kvm_cpu_has(X86_FEATURE_SMEP)) - cr4 |= X86_CR4_SMEP; - if (kvm_cpu_has(X86_FEATURE_SMAP)) - cr4 |= X86_CR4_SMAP; - if (kvm_cpu_has(X86_FEATURE_PKU)) - cr4 |= X86_CR4_PKE; - - return cr4; -} - -int main(int argc, char *argv[]) -{ - struct kvm_sregs sregs; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t cr4; - int rc, i; - - /* - * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and - * use it to verify all supported CR4 bits can be set prior to defining - * the vCPU model, i.e. without doing KVM_SET_CPUID2. - */ - vm = vm_create_barebones(); - vcpu = __vm_vcpu_add(vm, 0); - - vcpu_sregs_get(vcpu, &sregs); - - sregs.cr0 = 0; - sregs.cr4 |= calc_supported_cr4_feature_bits(); - cr4 = sregs.cr4; - - rc = _vcpu_sregs_set(vcpu, &sregs); - TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); - - vcpu_sregs_get(vcpu, &sregs); - TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", - sregs.cr4, cr4); - - /* Verify all unsupported features are rejected by KVM. */ - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP); - TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE); - - for (i = 32; i < 64; i++) - TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i)); - - /* NW without CD is illegal, as is PG without PE. */ - TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW); - TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG); - - kvm_vm_free(vm); - - /* Create a "real" VM and verify APIC_BASE can be set. 
*/ - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - vcpu_sregs_get(vcpu, &sregs); - sregs.apic_base = 1 << 10; - rc = _vcpu_sregs_set(vcpu, &sregs); - TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", - sregs.apic_base); - sregs.apic_base = 1 << 11; - rc = _vcpu_sregs_set(vcpu, &sregs); - TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", - sregs.apic_base); - - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c deleted file mode 100644 index 3fb967f40c6a..000000000000 --- a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c +++ /dev/null @@ -1,152 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" -#include "kselftest.h" - -#define SVM_SEV_FEAT_DEBUG_SWAP 32u - -/* - * Some features may have hidden dependencies, or may only work - * for certain VM types. Err on the side of safety and don't - * expect that all supported features can be passed one by one - * to KVM_SEV_INIT2. - * - * (Well, right now there's only one...) - */ -#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP - -int kvm_fd; -u64 supported_vmsa_features; -bool have_sev_es; - -static int __sev_ioctl(int vm_fd, int cmd_id, void *data) -{ - struct kvm_sev_cmd cmd = { - .id = cmd_id, - .data = (uint64_t)data, - .sev_fd = open_sev_dev_path_or_exit(), - }; - int ret; - - ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); - TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, - "%d failed: fw error: %d\n", - cmd_id, cmd.error); - - return ret; -} - -static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) -{ - struct kvm_vm *vm; - int ret; - - vm = vm_create_barebones_type(vm_type); - ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); - TEST_ASSERT(ret == 0, - "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", - ret, errno); - kvm_vm_free(vm); -} - -static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) -{ - struct kvm_vm *vm; - int ret; - - vm = vm_create_barebones_type(vm_type); - ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "KVM_SEV_INIT2 should fail, %s.", - msg); - kvm_vm_free(vm); -} - -void test_vm_types(void) -{ - test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); - - /* - * TODO: check that unsupported types cannot be created. Probably - * a separate selftest. 
- */ - if (have_sev_es) - test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); - - test_init2_invalid(0, &(struct kvm_sev_init){}, - "VM type is KVM_X86_DEFAULT_VM"); - if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) - test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, - "VM type is KVM_X86_SW_PROTECTED_VM"); -} - -void test_flags(uint32_t vm_type) -{ - int i; - - for (i = 0; i < 32; i++) - test_init2_invalid(vm_type, - &(struct kvm_sev_init){ .flags = BIT(i) }, - "invalid flag"); -} - -void test_features(uint32_t vm_type, uint64_t supported_features) -{ - int i; - - for (i = 0; i < 64; i++) { - if (!(supported_features & BIT_ULL(i))) - test_init2_invalid(vm_type, - &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, - "unknown feature"); - else if (KNOWN_FEATURES & BIT_ULL(i)) - test_init2(vm_type, - &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); - } -} - -int main(int argc, char *argv[]) -{ - int kvm_fd = open_kvm_dev_path_or_exit(); - bool have_sev; - - TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, - KVM_X86_SEV_VMSA_FEATURES) == 0); - kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, - KVM_X86_SEV_VMSA_FEATURES, - &supported_vmsa_features); - - have_sev = kvm_cpu_has(X86_FEATURE_SEV); - TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), - "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", - kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); - - TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); - have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); - - TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), - "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", - kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); - - test_vm_types(); - - test_flags(KVM_X86_SEV_VM); - if (have_sev_es) - test_flags(KVM_X86_SEV_ES_VM); - - test_features(KVM_X86_SEV_VM, 0); - if (have_sev_es) - test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c deleted file mode 100644 index 0a6dfba3905b..000000000000 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ /dev/null @@ -1,397 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "sev.h" -#include "kselftest.h" - -#define NR_MIGRATE_TEST_VCPUS 4 -#define NR_MIGRATE_TEST_VMS 3 -#define NR_LOCK_TESTING_THREADS 3 -#define NR_LOCK_TESTING_ITERATIONS 10000 - -bool have_sev_es; - -static struct kvm_vm *sev_vm_create(bool es) -{ - struct kvm_vm *vm; - int i; - - vm = vm_create_barebones(); - if (!es) - sev_vm_init(vm); - else - sev_es_vm_init(vm); - - for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - __vm_vcpu_add(vm, i); - - sev_vm_launch(vm, es ? 
SEV_POLICY_ES : 0); - - if (es) - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); - return vm; -} - -static struct kvm_vm *aux_vm_create(bool with_vcpus) -{ - struct kvm_vm *vm; - int i; - - vm = vm_create_barebones(); - if (!with_vcpus) - return vm; - - for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - __vm_vcpu_add(vm, i); - - return vm; -} - -static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) -{ - return __vm_enable_cap(dst, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src->fd); -} - - -static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) -{ - int ret; - - ret = __sev_migrate_from(dst, src); - TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno); -} - -static void test_sev_migrate_from(bool es) -{ - struct kvm_vm *src_vm; - struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS]; - int i, ret; - - src_vm = sev_vm_create(es); - for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) - dst_vms[i] = aux_vm_create(true); - - /* Initial migration from the src to the first dst. */ - sev_migrate_from(dst_vms[0], src_vm); - - for (i = 1; i < NR_MIGRATE_TEST_VMS; i++) - sev_migrate_from(dst_vms[i], dst_vms[i - 1]); - - /* Migrate the guest back to the original VM. */ - ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]); - TEST_ASSERT(ret == -1 && errno == EIO, - "VM that was migrated from should be dead. ret %d, errno: %d", ret, - errno); - - kvm_vm_free(src_vm); - for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) - kvm_vm_free(dst_vms[i]); -} - -struct locking_thread_input { - struct kvm_vm *vm; - struct kvm_vm *source_vms[NR_LOCK_TESTING_THREADS]; -}; - -static void *locking_test_thread(void *arg) -{ - int i, j; - struct locking_thread_input *input = (struct locking_thread_input *)arg; - - for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) { - j = i % NR_LOCK_TESTING_THREADS; - __sev_migrate_from(input->vm, input->source_vms[j]); - } - - return NULL; -} - -static void test_sev_migrate_locking(void) -{ - struct locking_thread_input input[NR_LOCK_TESTING_THREADS]; - pthread_t pt[NR_LOCK_TESTING_THREADS]; - int i; - - for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) { - input[i].vm = sev_vm_create(/* es= */ false); - input[0].source_vms[i] = input[i].vm; - } - for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i) - memcpy(input[i].source_vms, input[0].source_vms, - sizeof(input[i].source_vms)); - - for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) - pthread_create(&pt[i], NULL, locking_test_thread, &input[i]); - - for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) - pthread_join(pt[i], NULL); - for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) - kvm_vm_free(input[i].vm); -} - -static void test_sev_migrate_parameters(void) -{ - struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_no_sev, - *sev_es_vm_no_vmsa; - int ret; - - vm_no_vcpu = vm_create_barebones(); - vm_no_sev = aux_vm_create(true); - ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Migrations require SEV enabled. ret %d, errno: %d", ret, - errno); - - if (!have_sev_es) - goto out; - - sev_vm = sev_vm_create(/* es= */ false); - sev_es_vm = sev_vm_create(/* es= */ true); - sev_es_vm_no_vmsa = vm_create_barebones(); - sev_es_vm_init(sev_es_vm_no_vmsa); - __vm_vcpu_add(sev_es_vm_no_vmsa, 1); - - ret = __sev_migrate_from(sev_vm, sev_es_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able migrate to SEV enabled VM. 
ret: %d, errno: %d", - ret, errno); - - ret = __sev_migrate_from(sev_es_vm, sev_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d", - ret, errno); - - ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d", - ret, errno); - - ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d", - ret, errno); - - kvm_vm_free(sev_vm); - kvm_vm_free(sev_es_vm); - kvm_vm_free(sev_es_vm_no_vmsa); -out: - kvm_vm_free(vm_no_vcpu); - kvm_vm_free(vm_no_sev); -} - -static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) -{ - return __vm_enable_cap(dst, KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, src->fd); -} - - -static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) -{ - int ret; - - ret = __sev_mirror_create(dst, src); - TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno); -} - -static void verify_mirror_allowed_cmds(struct kvm_vm *vm) -{ - struct kvm_sev_guest_status status; - int cmd_id; - - for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) { - int ret; - - /* - * These commands are allowed for mirror VMs, all others are - * not. - */ - switch (cmd_id) { - case KVM_SEV_LAUNCH_UPDATE_VMSA: - case KVM_SEV_GUEST_STATUS: - case KVM_SEV_DBG_DECRYPT: - case KVM_SEV_DBG_ENCRYPT: - continue; - default: - break; - } - - /* - * These commands should be disallowed before the data - * parameter is examined so NULL is OK here. - */ - ret = __vm_sev_ioctl(vm, cmd_id, NULL); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able call command: %d. ret: %d, errno: %d", - cmd_id, ret, errno); - } - - vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); -} - -static void test_sev_mirror(bool es) -{ - struct kvm_vm *src_vm, *dst_vm; - int i; - - src_vm = sev_vm_create(es); - dst_vm = aux_vm_create(false); - - sev_mirror_create(dst_vm, src_vm); - - /* Check that we can complete creation of the mirror VM. */ - for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - __vm_vcpu_add(dst_vm, i); - - if (es) - vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); - - verify_mirror_allowed_cmds(dst_vm); - - kvm_vm_free(src_vm); - kvm_vm_free(dst_vm); -} - -static void test_sev_mirror_parameters(void) -{ - struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu; - int ret; - - sev_vm = sev_vm_create(/* es= */ false); - vm_with_vcpu = aux_vm_create(true); - vm_no_vcpu = aux_vm_create(false); - - ret = __sev_mirror_create(sev_vm, sev_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able copy context to self. ret: %d, errno: %d", - ret, errno); - - ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu); - TEST_ASSERT(ret == -1 && errno == EINVAL, - "Copy context requires SEV enabled. ret %d, errno: %d", ret, - errno); - - ret = __sev_mirror_create(vm_with_vcpu, sev_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d", - ret, errno); - - if (!have_sev_es) - goto out; - - sev_es_vm = sev_vm_create(/* es= */ true); - ret = __sev_mirror_create(sev_vm, sev_es_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able copy context to SEV enabled VM. 
ret: %d, errno: %d", - ret, errno); - - ret = __sev_mirror_create(sev_es_vm, sev_vm); - TEST_ASSERT( - ret == -1 && errno == EINVAL, - "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d", - ret, errno); - - kvm_vm_free(sev_es_vm); - -out: - kvm_vm_free(sev_vm); - kvm_vm_free(vm_with_vcpu); - kvm_vm_free(vm_no_vcpu); -} - -static void test_sev_move_copy(void) -{ - struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm, - *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm; - - sev_vm = sev_vm_create(/* es= */ false); - dst_vm = aux_vm_create(true); - dst2_vm = aux_vm_create(true); - dst3_vm = aux_vm_create(true); - mirror_vm = aux_vm_create(false); - dst_mirror_vm = aux_vm_create(false); - dst2_mirror_vm = aux_vm_create(false); - dst3_mirror_vm = aux_vm_create(false); - - sev_mirror_create(mirror_vm, sev_vm); - - sev_migrate_from(dst_mirror_vm, mirror_vm); - sev_migrate_from(dst_vm, sev_vm); - - sev_migrate_from(dst2_vm, dst_vm); - sev_migrate_from(dst2_mirror_vm, dst_mirror_vm); - - sev_migrate_from(dst3_mirror_vm, dst2_mirror_vm); - sev_migrate_from(dst3_vm, dst2_vm); - - kvm_vm_free(dst_vm); - kvm_vm_free(sev_vm); - kvm_vm_free(dst2_vm); - kvm_vm_free(dst3_vm); - kvm_vm_free(mirror_vm); - kvm_vm_free(dst_mirror_vm); - kvm_vm_free(dst2_mirror_vm); - kvm_vm_free(dst3_mirror_vm); - - /* - * Run similar test be destroy mirrors before mirrored VMs to ensure - * destruction is done safely. - */ - sev_vm = sev_vm_create(/* es= */ false); - dst_vm = aux_vm_create(true); - mirror_vm = aux_vm_create(false); - dst_mirror_vm = aux_vm_create(false); - - sev_mirror_create(mirror_vm, sev_vm); - - sev_migrate_from(dst_mirror_vm, mirror_vm); - sev_migrate_from(dst_vm, sev_vm); - - kvm_vm_free(mirror_vm); - kvm_vm_free(dst_mirror_vm); - kvm_vm_free(dst_vm); - kvm_vm_free(sev_vm); -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)); - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); - - have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); - - if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { - test_sev_migrate_from(/* es= */ false); - if (have_sev_es) - test_sev_migrate_from(/* es= */ true); - test_sev_migrate_locking(); - test_sev_migrate_parameters(); - if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) - test_sev_move_copy(); - } - if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { - test_sev_mirror(/* es= */ false); - if (have_sev_es) - test_sev_mirror(/* es= */ true); - test_sev_mirror_parameters(); - } - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c deleted file mode 100644 index ae77698e6e97..000000000000 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" -#include "linux/psp-sev.h" -#include "sev.h" - - -#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) - -static void guest_sev_es_code(void) -{ - /* TODO: Check CPUID after GHCB-based hypercall support is added. */ - GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); - GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED); - - /* - * TODO: Add GHCB and ucall support for SEV-ES guests. 
For now, simply - * force "termination" to signal "done" via the GHCB MSR protocol. - */ - wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ); - __asm__ __volatile__("rep; vmmcall"); -} - -static void guest_sev_code(void) -{ - GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV)); - GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); - - GUEST_DONE(); -} - -/* Stash state passed via VMSA before any compiled code runs. */ -extern void guest_code_xsave(void); -asm("guest_code_xsave:\n" - "mov $" __stringify(XFEATURE_MASK_X87_AVX) ", %eax\n" - "xor %edx, %edx\n" - "xsave (%rdi)\n" - "jmp guest_sev_es_code"); - -static void compare_xsave(u8 *from_host, u8 *from_guest) -{ - int i; - bool bad = false; - for (i = 0; i < 4095; i++) { - if (from_host[i] != from_guest[i]) { - printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); - bad = true; - } - } - - if (bad) - abort(); -} - -static void test_sync_vmsa(uint32_t policy) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - vm_vaddr_t gva; - void *hva; - - double x87val = M_PI; - struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; - - vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); - gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, - MEM_REGION_TEST_DATA); - hva = addr_gva2hva(vm, gva); - - vcpu_args_set(vcpu, 1, gva); - - asm("fninit\n" - "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" - "fldl %3\n" - "xsave (%2)\n" - "fstp %%st\n" - : "=m"(xsave) - : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) - : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); - vcpu_xsave_set(vcpu, &xsave); - - vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); - - /* This page is shared, so make it decrypted. */ - memset(hva, 0, 4096); - - vcpu_run(vcpu); - - TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, - "Wanted SYSTEM_EVENT, got %s", - exit_reason_str(vcpu->run->exit_reason)); - TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); - TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); - TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); - - compare_xsave((u8 *)&xsave, (u8 *)hva); - - kvm_vm_free(vm); -} - -static void test_sev(void *guest_code, uint64_t policy) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - - uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; - - vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); - - /* TODO: Validate the measurement is as expected. */ - vm_sev_launch(vm, policy, NULL); - - for (;;) { - vcpu_run(vcpu); - - if (policy & SEV_POLICY_ES) { - TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, - "Wanted SYSTEM_EVENT, got %s", - exit_reason_str(vcpu->run->exit_reason)); - TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); - TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); - TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); - break; - } - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - continue; - case UCALL_DONE: - return; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unexpected exit: %s", - exit_reason_str(vcpu->run->exit_reason)); - } - } - - kvm_vm_free(vm); -} - -static void guest_shutdown_code(void) -{ - struct desc_ptr idt; - - /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. 
*/ - memset(&idt, 0, sizeof(idt)); - __asm__ __volatile__("lidt %0" :: "m"(idt)); - - __asm__ __volatile__("ud2"); -} - -static void test_sev_es_shutdown(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - uint32_t type = KVM_X86_SEV_ES_VM; - - vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); - - vm_sev_launch(vm, SEV_POLICY_ES, NULL); - - vcpu_run(vcpu); - TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, - "Wanted SHUTDOWN, got %s", - exit_reason_str(vcpu->run->exit_reason)); - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - const u64 xf_mask = XFEATURE_MASK_X87_AVX; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); - - test_sev(guest_sev_code, SEV_POLICY_NO_DBG); - test_sev(guest_sev_code, 0); - - if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { - test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); - test_sev(guest_sev_es_code, SEV_POLICY_ES); - - test_sev_es_shutdown(); - - if (kvm_has_cap(KVM_CAP_XCRS) && - (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) { - test_sync_vmsa(0); - test_sync_vmsa(SEV_POLICY_NO_DBG); - } - } - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c deleted file mode 100644 index fabeeaddfb3a..000000000000 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2020, Google LLC. - * - * Test that KVM emulates instructions in response to EPT violations when - * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. - */ -#include "flds_emulation.h" - -#include "test_util.h" -#include "kvm_util.h" -#include "vmx.h" - -#define MAXPHYADDR 36 - -#define MEM_REGION_GVA 0x0000123456789000 -#define MEM_REGION_GPA 0x0000000700000000 -#define MEM_REGION_SLOT 10 -#define MEM_REGION_SIZE PAGE_SIZE - -static void guest_code(bool tdp_enabled) -{ - uint64_t error_code; - uint64_t vector; - - vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA)); - - /* - * When TDP is enabled, flds will trigger an emulation failure, exit to - * userspace, and then the selftest host "VMM" skips the instruction. - * - * When TDP is disabled, no instruction emulation is required so flds - * should generate #PF(RSVD). 
- */ - if (tdp_enabled) { - GUEST_ASSERT(!vector); - } else { - GUEST_ASSERT_EQ(vector, PF_VECTOR); - GUEST_ASSERT(error_code & PFERR_RSVD_MASK); - } - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - uint64_t *pte; - uint64_t *hva; - uint64_t gpa; - int rc; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); - - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); - - rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); - TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); - vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - MEM_REGION_GPA, MEM_REGION_SLOT, - MEM_REGION_SIZE / PAGE_SIZE, 0); - gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, - MEM_REGION_GPA, MEM_REGION_SLOT); - TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc"); - virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); - hva = addr_gpa2hva(vm, MEM_REGION_GPA); - memset(hva, 0, PAGE_SIZE); - - pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); - *pte |= BIT_ULL(MAXPHYADDR); - - vcpu_run(vcpu); - - /* - * When TDP is enabled, KVM must emulate in response the guest physical - * address that is illegal from the guest's perspective, but is legal - * from hardware's perspeective. This should result in an emulation - * failure exit to userspace since KVM doesn't support emulating flds. - */ - if (kvm_is_tdp_enabled()) { - handle_flds_emulation_failure_exit(vcpu); - vcpu_run(vcpu); - } - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unrecognized ucall: %lu", uc.cmd); - } - - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c deleted file mode 100644 index 55c88d664a94..000000000000 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018, Red Hat, Inc. - * - * Tests for SMM. - */ -#include -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" - -#include "vmx.h" -#include "svm_util.h" - -#define SMRAM_SIZE 65536 -#define SMRAM_MEMSLOT ((1 << 16) | 1) -#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) -#define SMRAM_GPA 0x1000000 -#define SMRAM_STAGE 0xfe - -#define STR(x) #x -#define XSTR(s) STR(s) - -#define SYNC_PORT 0xe -#define DONE 0xff - -/* - * This is compiled as normal 64-bit code, however, SMI handler is executed - * in real-address mode. To stay simple we're limiting ourselves to a mode - * independent subset of asm here. - * SMI handler always report back fixed stage SMRAM_STAGE. 
- */ -uint8_t smi_handler[] = { - 0xb0, SMRAM_STAGE, /* mov $SMRAM_STAGE, %al */ - 0xe4, SYNC_PORT, /* in $SYNC_PORT, %al */ - 0x0f, 0xaa, /* rsm */ -}; - -static inline void sync_with_host(uint64_t phase) -{ - asm volatile("in $" XSTR(SYNC_PORT)", %%al \n" - : "+a" (phase)); -} - -static void self_smi(void) -{ - x2apic_write_reg(APIC_ICR, - APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); -} - -static void l2_guest_code(void) -{ - sync_with_host(8); - - sync_with_host(10); - - vmcall(); -} - -static void guest_code(void *arg) -{ - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); - struct svm_test_data *svm = arg; - struct vmx_pages *vmx_pages = arg; - - sync_with_host(1); - - wrmsr(MSR_IA32_APICBASE, apicbase | X2APIC_ENABLE); - - sync_with_host(2); - - self_smi(); - - sync_with_host(4); - - if (arg) { - if (this_cpu_has(X86_FEATURE_SVM)) { - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - } else { - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - } - - sync_with_host(5); - - self_smi(); - - sync_with_host(7); - - if (this_cpu_has(X86_FEATURE_SVM)) { - run_guest(svm->vmcb, svm->vmcb_gpa); - run_guest(svm->vmcb, svm->vmcb_gpa); - } else { - vmlaunch(); - vmresume(); - } - - /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */ - sync_with_host(12); - } - - sync_with_host(DONE); -} - -void inject_smi(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_events events; - - vcpu_events_get(vcpu, &events); - - events.smi.pending = 1; - events.flags |= KVM_VCPUEVENT_VALID_SMM; - - vcpu_events_set(vcpu, &events); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t nested_gva = 0; - - struct kvm_vcpu *vcpu; - struct kvm_regs regs; - struct kvm_vm *vm; - struct kvm_x86_state *state; - int stage, stage_reported; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM)); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA, - SMRAM_MEMSLOT, SMRAM_PAGES, 0); - TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, SMRAM_GPA, SMRAM_MEMSLOT) - == SMRAM_GPA, "could not allocate guest physical addresses?"); - - memset(addr_gpa2hva(vm, SMRAM_GPA), 0x0, SMRAM_SIZE); - memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, - sizeof(smi_handler)); - - vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); - - if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { - if (kvm_cpu_has(X86_FEATURE_SVM)) - vcpu_alloc_svm(vm, &nested_gva); - else if (kvm_cpu_has(X86_FEATURE_VMX)) - vcpu_alloc_vmx(vm, &nested_gva); - } - - if (!nested_gva) - pr_info("will skip SMM test with VMX enabled\n"); - - vcpu_args_set(vcpu, 1, nested_gva); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - memset(®s, 0, sizeof(regs)); - vcpu_regs_get(vcpu, ®s); - - stage_reported = regs.rax & 0xff; - - if (stage_reported == DONE) - goto done; - - TEST_ASSERT(stage_reported == stage || - stage_reported == SMRAM_STAGE, - "Unexpected stage: #%x, got %x", - stage, stage_reported); - - /* - * Enter SMM during L2 execution and check that we correctly - * return from it. Do not perform save/restore while in SMM yet. - */ - if (stage == 8) { - inject_smi(vcpu); - continue; - } - - /* - * Perform save/restore while the guest is in SMM triggered - * during L2 execution. 
- */ - if (stage == 10) - inject_smi(vcpu); - - state = vcpu_save_state(vcpu); - kvm_vm_release(vm); - - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - } - -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c deleted file mode 100644 index 141b7fc0c965..000000000000 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ /dev/null @@ -1,323 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * KVM_GET/SET_* tests - * - * Copyright (C) 2018, Red Hat, Inc. - * - * Tests for vCPU state save/restore, including nested guest state. - */ -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" -#include "svm_util.h" - -#define L2_GUEST_STACK_SIZE 256 - -void svm_l2_guest_code(void) -{ - GUEST_SYNC(4); - /* Exit to L1 */ - vmcall(); - GUEST_SYNC(6); - /* Done, exit to L1 and never come back. */ - vmcall(); -} - -static void svm_l1_guest_code(struct svm_test_data *svm) -{ - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - - GUEST_ASSERT(svm->vmcb_gpa); - /* Prepare for L2 execution. */ - generic_svm_setup(svm, svm_l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(3); - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_SYNC(5); - vmcb->save.rip += 3; - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_SYNC(7); -} - -void vmx_l2_guest_code(void) -{ - GUEST_SYNC(6); - - /* Exit to L1 */ - vmcall(); - - /* L1 has now set up a shadow VMCS for us. */ - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); - GUEST_SYNC(10); - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); - GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); - GUEST_SYNC(11); - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); - GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); - GUEST_SYNC(12); - - /* Done, exit to L1 and never come back. */ - vmcall(); -} - -static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) -{ - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - GUEST_ASSERT(vmx_pages->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_SYNC(3); - GUEST_ASSERT(load_vmcs(vmx_pages)); - GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - - GUEST_SYNC(4); - GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - - prepare_vmcs(vmx_pages, vmx_l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(5); - GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - /* Check that the launched state is preserved. 
*/ - GUEST_ASSERT(vmlaunch()); - - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - GUEST_SYNC(7); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); - - vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa); - - GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); - GUEST_ASSERT(vmlaunch()); - GUEST_SYNC(8); - GUEST_ASSERT(vmlaunch()); - GUEST_ASSERT(vmresume()); - - vmwrite(GUEST_RIP, 0xc0ffee); - GUEST_SYNC(9); - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); - - GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); - GUEST_ASSERT(vmlaunch()); - GUEST_ASSERT(vmresume()); - GUEST_SYNC(13); - GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); - GUEST_ASSERT(vmlaunch()); - GUEST_ASSERT(vmresume()); -} - -static void __attribute__((__flatten__)) guest_code(void *arg) -{ - GUEST_SYNC(1); - - if (this_cpu_has(X86_FEATURE_XSAVE)) { - uint64_t supported_xcr0 = this_cpu_supported_xcr0(); - uint8_t buffer[4096]; - - memset(buffer, 0xcc, sizeof(buffer)); - - /* - * Modify state for all supported xfeatures to take them out of - * their "init" state, i.e. to make them show up in XSTATE_BV. - * - * Note off-by-default features, e.g. AMX, are out of scope for - * this particular testcase as they have a different ABI. - */ - GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_FP); - asm volatile ("fincstp"); - - GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_SSE); - asm volatile ("vmovdqu %0, %%xmm0" :: "m" (buffer)); - - if (supported_xcr0 & XFEATURE_MASK_YMM) - asm volatile ("vmovdqu %0, %%ymm0" :: "m" (buffer)); - - if (supported_xcr0 & XFEATURE_MASK_AVX512) { - asm volatile ("kmovq %0, %%k1" :: "r" (-1ull)); - asm volatile ("vmovupd %0, %%zmm0" :: "m" (buffer)); - asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer)); - } - - if (this_cpu_has(X86_FEATURE_MPX)) { - uint64_t bounds[2] = { 10, 0xffffffffull }; - uint64_t output[2] = { }; - - GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDREGS); - GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDCSR); - - /* - * Don't bother trying to get BNDCSR into the INUSE - * state. MSR_IA32_BNDCFGS doesn't count as it isn't - * managed via XSAVE/XRSTOR, and BNDCFGU can only be - * modified by XRSTOR. Stuffing XSTATE_BV in the host - * is simpler than doing XRSTOR here in the guest. - * - * However, temporarily enable MPX in BNDCFGS so that - * BNDMOV actually loads BND1. If MPX isn't *fully* - * enabled, all MPX instructions are treated as NOPs. - * - * Hand encode "bndmov (%rax),%bnd1" as support for MPX - * mnemonics/registers has been removed from gcc and - * clang (and was never fully supported by clang). - */ - wrmsr(MSR_IA32_BNDCFGS, BIT_ULL(0)); - asm volatile (".byte 0x66,0x0f,0x1a,0x08" :: "a" (bounds)); - /* - * Hand encode "bndmov %bnd1, (%rax)" to sanity check - * that BND1 actually got loaded. 
- */ - asm volatile (".byte 0x66,0x0f,0x1b,0x08" :: "a" (output)); - wrmsr(MSR_IA32_BNDCFGS, 0); - - GUEST_ASSERT_EQ(bounds[0], output[0]); - GUEST_ASSERT_EQ(bounds[1], output[1]); - } - if (this_cpu_has(X86_FEATURE_PKU)) { - GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_PKRU); - set_cr4(get_cr4() | X86_CR4_PKE); - GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSPKE)); - - wrpkru(-1u); - } - } - - GUEST_SYNC(2); - - if (arg) { - if (this_cpu_has(X86_FEATURE_SVM)) - svm_l1_guest_code(arg); - else - vmx_l1_guest_code(arg); - } - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - uint64_t *xstate_bv, saved_xstate_bv; - vm_vaddr_t nested_gva = 0; - struct kvm_cpuid2 empty_cpuid = {}; - struct kvm_regs regs1, regs2; - struct kvm_vcpu *vcpu, *vcpuN; - struct kvm_vm *vm; - struct kvm_x86_state *state; - struct ucall uc; - int stage; - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vcpu_regs_get(vcpu, ®s1); - - if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { - if (kvm_cpu_has(X86_FEATURE_SVM)) - vcpu_alloc_svm(vm, &nested_gva); - else if (kvm_cpu_has(X86_FEATURE_VMX)) - vcpu_alloc_vmx(vm, &nested_gva); - } - - if (!nested_gva) - pr_info("will skip nested state checks\n"); - - vcpu_args_set(vcpu, 1, nested_gva); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - /* UCALL_SYNC is handled here. */ - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - - /* - * Restore XSAVE state in a dummy vCPU, first without doing - * KVM_SET_CPUID2, and then with an empty guest CPUID. Except - * for off-by-default xfeatures, e.g. AMX, KVM is supposed to - * allow KVM_SET_XSAVE regardless of guest CPUID. Manually - * load only XSAVE state, MSRs in particular have a much more - * convoluted ABI. - * - * Load two versions of XSAVE state: one with the actual guest - * XSAVE state, and one with all supported features forced "on" - * in xstate_bv, e.g. to ensure that KVM allows loading all - * supported features, even if something goes awry in saving - * the original snapshot. 
- */ - xstate_bv = (void *)&((uint8_t *)state->xsave->region)[512]; - saved_xstate_bv = *xstate_bv; - - vcpuN = __vm_vcpu_add(vm, vcpu->id + 1); - vcpu_xsave_set(vcpuN, state->xsave); - *xstate_bv = kvm_cpu_supported_xcr0(); - vcpu_xsave_set(vcpuN, state->xsave); - - vcpu_init_cpuid(vcpuN, &empty_cpuid); - vcpu_xsave_set(vcpuN, state->xsave); - *xstate_bv = saved_xstate_bv; - vcpu_xsave_set(vcpuN, state->xsave); - - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); - } - -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c deleted file mode 100644 index 916e04248fbb..000000000000 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ /dev/null @@ -1,118 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * svm_int_ctl_test - * - * Copyright (C) 2021, Red Hat, Inc. - * - * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" -#include "apic.h" - -bool vintr_irq_called; -bool intr_irq_called; - -#define VINTR_IRQ_NUMBER 0x20 -#define INTR_IRQ_NUMBER 0x30 - -static void vintr_irq_handler(struct ex_regs *regs) -{ - vintr_irq_called = true; -} - -static void intr_irq_handler(struct ex_regs *regs) -{ - x2apic_write_reg(APIC_EOI, 0x00); - intr_irq_called = true; -} - -static void l2_guest_code(struct svm_test_data *svm) -{ - /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, - * and since L1 didn't enable virtual interrupt masking, - * L2 should receive it and not L1. - * - * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ - * so it should also receive it after the following 'sti'. - */ - x2apic_write_reg(APIC_ICR, - APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); - - __asm__ __volatile__( - "sti\n" - "nop\n" - ); - - GUEST_ASSERT(vintr_irq_called); - GUEST_ASSERT(intr_irq_called); - - __asm__ __volatile__( - "vmcall\n" - ); -} - -static void l1_guest_code(struct svm_test_data *svm) -{ - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - - x2apic_enable(); - - /* Prepare for L2 execution. 
*/ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* No virtual interrupt masking */ - vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - - /* No intercepts for real and virtual interrupts */ - vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR)); - - /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ - vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); - vmcb->control.int_vector = VINTR_IRQ_NUMBER; - - run_guest(vmcb, svm->vmcb_gpa); - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - vm_vaddr_t svm_gva; - struct kvm_vm *vm; - struct ucall uc; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); - vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); - - vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 1, svm_gva); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - /* NOT REACHED */ - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c deleted file mode 100644 index 00135cbba35e..000000000000 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * svm_nested_shutdown_test - * - * Copyright (C) 2022, Red Hat, Inc. - * - * Nested SVM testing: test that unintercepted shutdown in L2 doesn't crash the host - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" - -static void l2_guest_code(struct svm_test_data *svm) -{ - __asm__ __volatile__("ud2"); -} - -static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt) -{ - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); - - idt[6].p = 0; // #UD is intercepted but its injection will cause #NP - idt[11].p = 0; // #NP is not intercepted and will cause another - // #NP that will be converted to #DF - idt[8].p = 0; // #DF will cause #NP which will cause SHUTDOWN - - run_guest(vmcb, svm->vmcb_gpa); - - /* should not reach here */ - GUEST_ASSERT(0); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - vm_vaddr_t svm_gva; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_svm(vm, &svm_gva); - - vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c deleted file mode 100644 index 7b6481d6c0d3..000000000000 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2022 Oracle and/or its affiliates. 
- * - * Based on: - * svm_int_ctl_test - * - * Copyright (C) 2021, Red Hat, Inc. - * - */ -#include -#include -#include -#include "apic.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" -#include "test_util.h" - -#define INT_NR 0x20 - -static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); - -static unsigned int bp_fired; -static void guest_bp_handler(struct ex_regs *regs) -{ - bp_fired++; -} - -static unsigned int int_fired; -static void l2_guest_code_int(void); - -static void guest_int_handler(struct ex_regs *regs) -{ - int_fired++; - GUEST_ASSERT_EQ(regs->rip, (unsigned long)l2_guest_code_int); -} - -static void l2_guest_code_int(void) -{ - GUEST_ASSERT_EQ(int_fired, 1); - - /* - * Same as the vmmcall() function, but with a ud2 sneaked after the - * vmmcall. The caller injects an exception with the return address - * increased by 2, so the "pop rbp" must be after the ud2 and we cannot - * use vmmcall() directly. - */ - __asm__ __volatile__("push %%rbp; vmmcall; ud2; pop %%rbp" - : : "a"(0xdeadbeef), "c"(0xbeefdead) - : "rbx", "rdx", "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", "r14", "r15"); - - GUEST_ASSERT_EQ(bp_fired, 1); - hlt(); -} - -static atomic_int nmi_stage; -#define nmi_stage_get() atomic_load_explicit(&nmi_stage, memory_order_acquire) -#define nmi_stage_inc() atomic_fetch_add_explicit(&nmi_stage, 1, memory_order_acq_rel) -static void guest_nmi_handler(struct ex_regs *regs) -{ - nmi_stage_inc(); - - if (nmi_stage_get() == 1) { - vmmcall(); - GUEST_FAIL("Unexpected resume after VMMCALL"); - } else { - GUEST_ASSERT_EQ(nmi_stage_get(), 3); - GUEST_DONE(); - } -} - -static void l2_guest_code_nmi(void) -{ - ud2(); -} - -static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t idt_alt) -{ - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - - if (is_nmi) - x2apic_enable(); - - /* Prepare for L2 execution. */ - generic_svm_setup(svm, - is_nmi ? 
l2_guest_code_nmi : l2_guest_code_int, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR); - vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT); - - if (is_nmi) { - vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; - } else { - vmcb->control.event_inj = INT_NR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT; - /* The return address pushed on stack */ - vmcb->control.next_rip = vmcb->save.rip; - } - - run_guest(vmcb, svm->vmcb_gpa); - __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", - vmcb->control.exit_code, - vmcb->control.exit_info_1, vmcb->control.exit_info_2); - - if (is_nmi) { - clgi(); - x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_NMI); - - GUEST_ASSERT_EQ(nmi_stage_get(), 1); - nmi_stage_inc(); - - stgi(); - /* self-NMI happens here */ - while (true) - cpu_relax(); - } - - /* Skip over VMMCALL */ - vmcb->save.rip += 3; - - /* Switch to alternate IDT to cause intervening NPF again */ - vmcb->save.idtr.base = idt_alt; - vmcb->control.clean = 0; /* &= ~BIT(VMCB_DT) would be enough */ - - vmcb->control.event_inj = BP_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; - /* The return address pushed on stack, skip over UD2 */ - vmcb->control.next_rip = vmcb->save.rip + 2; - - run_guest(vmcb, svm->vmcb_gpa); - __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", - vmcb->control.exit_code, - vmcb->control.exit_info_1, vmcb->control.exit_info_2); - - GUEST_DONE(); -} - -static void run_test(bool is_nmi) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - vm_vaddr_t svm_gva; - vm_vaddr_t idt_alt_vm; - struct kvm_guest_debug debug; - - pr_info("Running %s test\n", is_nmi ? 
"NMI" : "soft int"); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); - vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); - vm_install_exception_handler(vm, INT_NR, guest_int_handler); - - vcpu_alloc_svm(vm, &svm_gva); - - if (!is_nmi) { - void *idt, *idt_alt; - - idt_alt_vm = vm_vaddr_alloc_page(vm); - idt_alt = addr_gva2hva(vm, idt_alt_vm); - idt = addr_gva2hva(vm, vm->arch.idt); - memcpy(idt_alt, idt, getpagesize()); - } else { - idt_alt_vm = 0; - } - vcpu_args_set(vcpu, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); - - memset(&debug, 0, sizeof(debug)); - vcpu_guest_debug_set(vcpu, &debug); - - struct ucall uc; - - alarm(2); - vcpu_run(vcpu); - alarm(0); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - /* NOT REACHED */ - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } -done: - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - - TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS), - "KVM with nSVM is supposed to unconditionally advertise nRIP Save"); - - atomic_init(&nmi_stage, 0); - - run_test(false); - run_test(true); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c deleted file mode 100644 index 8a62cca28cfb..000000000000 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * svm_vmcall_test - * - * Copyright (C) 2020, Red Hat, Inc. - * - * Nested SVM testing: VMCALL - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "svm_util.h" - -static void l2_guest_code(struct svm_test_data *svm) -{ - __asm__ __volatile__("vmcall"); -} - -static void l1_guest_code(struct svm_test_data *svm) -{ - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - struct vmcb *vmcb = svm->vmcb; - - /* Prepare for L2 execution. */ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - run_guest(vmcb, svm->vmcb_gpa); - - GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - vm_vaddr_t svm_gva; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 1, svm_gva); - - for (;;) { - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } - } -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c deleted file mode 100644 index 8fa3948b0170..000000000000 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ /dev/null @@ -1,411 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Test for x86 KVM_CAP_SYNC_REGS - * - * Copyright (C) 2018, Google LLC. 
- * - * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality, - * including requesting an invalid register set, updates to/from values - * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. - */ -#include -#include -#include -#include -#include -#include - -#include "kvm_test_harness.h" -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#define UCALL_PIO_PORT ((uint16_t)0x1000) - -struct ucall uc_none = { - .cmd = UCALL_NONE, -}; - -/* - * ucall is embedded here to protect against compiler reshuffling registers - * before calling a function. In this test we only need to get KVM_EXIT_IO - * vmexit and preserve RBX, no additional information is needed. - */ -void guest_code(void) -{ - asm volatile("1: in %[port], %%al\n" - "add $0x1, %%rbx\n" - "jmp 1b" - : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none) - : "rax", "rbx"); -} - -KVM_ONE_VCPU_TEST_SUITE(sync_regs_test); - -static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) -{ -#define REG_COMPARE(reg) \ - TEST_ASSERT(left->reg == right->reg, \ - "Register " #reg \ - " values did not match: 0x%llx, 0x%llx", \ - left->reg, right->reg) - REG_COMPARE(rax); - REG_COMPARE(rbx); - REG_COMPARE(rcx); - REG_COMPARE(rdx); - REG_COMPARE(rsi); - REG_COMPARE(rdi); - REG_COMPARE(rsp); - REG_COMPARE(rbp); - REG_COMPARE(r8); - REG_COMPARE(r9); - REG_COMPARE(r10); - REG_COMPARE(r11); - REG_COMPARE(r12); - REG_COMPARE(r13); - REG_COMPARE(r14); - REG_COMPARE(r15); - REG_COMPARE(rip); - REG_COMPARE(rflags); -#undef REG_COMPARE -} - -static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right) -{ -} - -static void compare_vcpu_events(struct kvm_vcpu_events *left, - struct kvm_vcpu_events *right) -{ -} - -#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS) -#define INVALID_SYNC_FIELD 0x80000000 - -/* - * Set an exception as pending *and* injected while KVM is processing events. - * KVM is supposed to ignore/drop pending exceptions if userspace is also - * requesting that an exception be injected. - */ -static void *race_events_inj_pen(void *arg) -{ - struct kvm_run *run = (struct kvm_run *)arg; - struct kvm_vcpu_events *events = &run->s.regs.events; - - WRITE_ONCE(events->exception.nr, UD_VECTOR); - - for (;;) { - WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); - WRITE_ONCE(events->flags, 0); - WRITE_ONCE(events->exception.injected, 1); - WRITE_ONCE(events->exception.pending, 1); - - pthread_testcancel(); - } - - return NULL; -} - -/* - * Set an invalid exception vector while KVM is processing events. KVM is - * supposed to reject any vector >= 32, as well as NMIs (vector 2). - */ -static void *race_events_exc(void *arg) -{ - struct kvm_run *run = (struct kvm_run *)arg; - struct kvm_vcpu_events *events = &run->s.regs.events; - - for (;;) { - WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); - WRITE_ONCE(events->flags, 0); - WRITE_ONCE(events->exception.nr, UD_VECTOR); - WRITE_ONCE(events->exception.pending, 1); - WRITE_ONCE(events->exception.nr, 255); - - pthread_testcancel(); - } - - return NULL; -} - -/* - * Toggle CR4.PAE while KVM is processing SREGS, EFER.LME=1 with CR4.PAE=0 is - * illegal, and KVM's MMU heavily relies on vCPU state being valid. 
- */ -static noinline void *race_sregs_cr4(void *arg) -{ - struct kvm_run *run = (struct kvm_run *)arg; - __u64 *cr4 = &run->s.regs.sregs.cr4; - __u64 pae_enabled = *cr4; - __u64 pae_disabled = *cr4 & ~X86_CR4_PAE; - - for (;;) { - WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_SREGS); - WRITE_ONCE(*cr4, pae_enabled); - asm volatile(".rept 512\n\t" - "nop\n\t" - ".endr"); - WRITE_ONCE(*cr4, pae_disabled); - - pthread_testcancel(); - } - - return NULL; -} - -static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer) -{ - const time_t TIMEOUT = 2; /* seconds, roughly */ - struct kvm_x86_state *state; - struct kvm_translation tr; - struct kvm_run *run; - pthread_t thread; - time_t t; - - run = vcpu->run; - - run->kvm_valid_regs = KVM_SYNC_X86_SREGS; - vcpu_run(vcpu); - run->kvm_valid_regs = 0; - - /* Save state *before* spawning the thread that mucks with vCPU state. */ - state = vcpu_save_state(vcpu); - - /* - * Selftests run 64-bit guests by default, both EFER.LME and CR4.PAE - * should already be set in guest state. - */ - TEST_ASSERT((run->s.regs.sregs.cr4 & X86_CR4_PAE) && - (run->s.regs.sregs.efer & EFER_LME), - "vCPU should be in long mode, CR4.PAE=%d, EFER.LME=%d", - !!(run->s.regs.sregs.cr4 & X86_CR4_PAE), - !!(run->s.regs.sregs.efer & EFER_LME)); - - TEST_ASSERT_EQ(pthread_create(&thread, NULL, racer, (void *)run), 0); - - for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { - /* - * Reload known good state if the vCPU triple faults, e.g. due - * to the unhandled #GPs being injected. VMX preserves state - * on shutdown, but SVM synthesizes an INIT as the VMCB state - * is architecturally undefined on triple fault. - */ - if (!__vcpu_run(vcpu) && run->exit_reason == KVM_EXIT_SHUTDOWN) - vcpu_load_state(vcpu, state); - - if (racer == race_sregs_cr4) { - tr = (struct kvm_translation) { .linear_address = 0 }; - __vcpu_ioctl(vcpu, KVM_TRANSLATE, &tr); - } - } - - TEST_ASSERT_EQ(pthread_cancel(thread), 0); - TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); - - kvm_x86_state_cleanup(state); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code) -{ - struct kvm_run *run = vcpu->run; - int rv; - - /* Request reading invalid register set from VCPU. */ - run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_valid_regs = 0; - - run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_valid_regs = 0; -} - -KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code) -{ - struct kvm_run *run = vcpu->run; - int rv; - - /* Request setting invalid register set into VCPU. */ - run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_dirty_regs = 0; - - run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu); - TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", - rv); - run->kvm_dirty_regs = 0; -} - -KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code) -{ - struct kvm_run *run = vcpu->run; - struct kvm_vcpu_events events; - struct kvm_sregs sregs; - struct kvm_regs regs; - - /* Request and verify all valid register sets. 
*/ - /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - vcpu_regs_get(vcpu, ®s); - compare_regs(®s, &run->s.regs.regs); - - vcpu_sregs_get(vcpu, &sregs); - compare_sregs(&sregs, &run->s.regs.sregs); - - vcpu_events_get(vcpu, &events); - compare_vcpu_events(&events, &run->s.regs.events); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code) -{ - struct kvm_run *run = vcpu->run; - struct kvm_vcpu_events events; - struct kvm_sregs sregs; - struct kvm_regs regs; - - /* Run once to get register set */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - /* Set and verify various register values. */ - run->s.regs.regs.rbx = 0xBAD1DEA; - run->s.regs.sregs.apic_base = 1 << 11; - /* TODO run->s.regs.events.XYZ = ABC; */ - - run->kvm_valid_regs = TEST_SYNC_FIELDS; - run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1, - "rbx sync regs value incorrect 0x%llx.", - run->s.regs.regs.rbx); - TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11, - "apic_base sync regs value incorrect 0x%llx.", - run->s.regs.sregs.apic_base); - - vcpu_regs_get(vcpu, ®s); - compare_regs(®s, &run->s.regs.regs); - - vcpu_sregs_get(vcpu, &sregs); - compare_sregs(&sregs, &run->s.regs.sregs); - - vcpu_events_get(vcpu, &events); - compare_vcpu_events(&events, &run->s.regs.events); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code) -{ - struct kvm_run *run = vcpu->run; - - /* Clear kvm_dirty_regs bits, verify new s.regs values are - * overwritten with existing guest values. - */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - run->kvm_dirty_regs = 0; - run->s.regs.regs.rbx = 0xDEADBEEF; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF, - "rbx sync regs value incorrect 0x%llx.", - run->s.regs.regs.rbx); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code) -{ - struct kvm_run *run = vcpu->run; - struct kvm_regs regs; - - /* Run once to get register set */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - /* Clear kvm_valid_regs bits and kvm_dirty_bits. - * Verify s.regs values are not overwritten with existing guest values - * and that guest values are not overwritten with kvm_sync_regs values. - */ - run->kvm_valid_regs = 0; - run->kvm_dirty_regs = 0; - run->s.regs.regs.rbx = 0xAAAA; - vcpu_regs_get(vcpu, ®s); - regs.rbx = 0xBAC0; - vcpu_regs_set(vcpu, ®s); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, - "rbx sync regs value incorrect 0x%llx.", - run->s.regs.regs.rbx); - vcpu_regs_get(vcpu, ®s); - TEST_ASSERT(regs.rbx == 0xBAC0 + 1, - "rbx guest value incorrect 0x%llx.", - regs.rbx); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code) -{ - struct kvm_run *run = vcpu->run; - struct kvm_regs regs; - - /* Run once to get register set */ - run->kvm_valid_regs = TEST_SYNC_FIELDS; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten - * with existing guest values but that guest values are overwritten - * with kvm_sync_regs values. 
- */ - run->kvm_valid_regs = 0; - run->kvm_dirty_regs = TEST_SYNC_FIELDS; - run->s.regs.regs.rbx = 0xBBBB; - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, - "rbx sync regs value incorrect 0x%llx.", - run->s.regs.regs.rbx); - vcpu_regs_get(vcpu, ®s); - TEST_ASSERT(regs.rbx == 0xBBBB + 1, - "rbx guest value incorrect 0x%llx.", - regs.rbx); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code) -{ - race_sync_regs(vcpu, race_sregs_cr4); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code) -{ - race_sync_regs(vcpu, race_events_exc); -} - -KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code) -{ - race_sync_regs(vcpu, race_events_inj_pen); -} - -int main(int argc, char *argv[]) -{ - int cap; - - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS); - TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD)); - - return test_harness_run(argc, argv); -} diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c deleted file mode 100644 index 56306a19144a..000000000000 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" -#include "svm_util.h" - -#include -#include - -#include "kselftest.h" - -#define ARBITRARY_IO_PORT 0x2000 - -/* The virtual machine object. */ -static struct kvm_vm *vm; - -static void l2_guest_code(void) -{ - asm volatile("inb %%dx, %%al" - : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); -} - -#define L2_GUEST_STACK_SIZE 64 -unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - -void l1_guest_code_vmx(struct vmx_pages *vmx) -{ - - GUEST_ASSERT(vmx->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx)); - GUEST_ASSERT(load_vmcs(vmx)); - - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_ASSERT(!vmlaunch()); - /* L2 should triple fault after a triple fault event injected. 
*/ - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); - GUEST_DONE(); -} - -void l1_guest_code_svm(struct svm_test_data *svm) -{ - struct vmcb *vmcb = svm->vmcb; - - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* don't intercept shutdown to test the case of SVM allowing to do so */ - vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); - - run_guest(vmcb, svm->vmcb_gpa); - - /* should not reach here, L1 should crash */ - GUEST_ASSERT(0); -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct kvm_vcpu_events events; - struct ucall uc; - - bool has_vmx = kvm_cpu_has(X86_FEATURE_VMX); - bool has_svm = kvm_cpu_has(X86_FEATURE_SVM); - - TEST_REQUIRE(has_vmx || has_svm); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)); - - - if (has_vmx) { - vm_vaddr_t vmx_pages_gva; - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_vmx); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - } else { - vm_vaddr_t svm_gva; - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_svm); - vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 1, svm_gva); - } - - vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); - run = vcpu->run; - vcpu_run(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, - "Expected IN from port %d from L2, got port %d", - ARBITRARY_IO_PORT, run->io.port); - vcpu_events_get(vcpu, &events); - events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; - events.triple_fault.pending = true; - vcpu_events_set(vcpu, &events); - run->immediate_exit = true; - vcpu_run_complete_io(vcpu); - - vcpu_events_get(vcpu, &events); - TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, - "Triple fault event invalid"); - TEST_ASSERT(events.triple_fault.pending, - "No triple fault pending"); - vcpu_run(vcpu); - - - if (has_svm) { - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); - } else { - switch (get_ucall(vcpu, &uc)) { - case UCALL_DONE: - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } - } - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c deleted file mode 100644 index 12b0964f4f13..000000000000 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST. - * - * Copyright (C) 2020, Red Hat, Inc. - */ -#include -#include -#include "kvm_util.h" -#include "processor.h" - -#define UNITY (1ull << 30) -#define HOST_ADJUST (UNITY * 64) -#define GUEST_STEP (UNITY * 4) -#define ROUND(x) ((x + UNITY / 2) & -UNITY) -#define rounded_rdmsr(x) ROUND(rdmsr(x)) -#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vcpu, x)) - -static void guest_code(void) -{ - u64 val = 0; - - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ - val = 1ull * GUEST_STEP; - wrmsr(MSR_IA32_TSC, val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. 
*/ - GUEST_SYNC(2); - val = 2ull * GUEST_STEP; - wrmsr(MSR_IA32_TSC_ADJUST, val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* Host: setting the TSC offset. */ - GUEST_SYNC(3); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* - * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the - * host-side offset and affect both MSRs. - */ - GUEST_SYNC(4); - val = 3ull * GUEST_STEP; - wrmsr(MSR_IA32_TSC_ADJUST, val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* - * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side - * offset is now visible in MSR_IA32_TSC_ADJUST. - */ - GUEST_SYNC(5); - val = 4ull * GUEST_STEP; - wrmsr(MSR_IA32_TSC, val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); - GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); - - GUEST_DONE(); -} - -static void run_vcpu(struct kvm_vcpu *vcpu, int stage) -{ - struct ucall uc; - - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - if (!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1) - ksft_test_result_pass("stage %d passed\n", stage + 1); - else - ksft_test_result_fail( - "stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); - return; - case UCALL_DONE: - ksft_test_result_pass("stage %d passed\n", stage + 1); - return; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu->run->exit_reason)); - } -} - -int main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t val; - - ksft_print_header(); - ksft_set_plan(5); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - val = 0; - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ - run_vcpu(vcpu, 1); - val = 1ull * GUEST_STEP; - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ - run_vcpu(vcpu, 2); - val = 2ull * GUEST_STEP; - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* - * Host: writes to MSR_IA32_TSC set the host-side offset - * and therefore do not change MSR_IA32_TSC_ADJUST. - */ - vcpu_set_msr(vcpu, MSR_IA32_TSC, HOST_ADJUST + val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - run_vcpu(vcpu, 3); - - /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ - vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, UNITY * 123456); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_TSC_ADJUST), UNITY * 123456); - - /* Restore previous value. */ - vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* - * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the - * host-side offset and affect both MSRs. 
- */ - run_vcpu(vcpu, 4); - val = 3ull * GUEST_STEP; - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - - /* - * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side - * offset is now visible in MSR_IA32_TSC_ADJUST. - */ - run_vcpu(vcpu, 5); - val = 4ull * GUEST_STEP; - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); - TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); - - kvm_vm_free(vm); - - ksft_finished(); /* Print results and exit() accordingly */ -} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c deleted file mode 100644 index 59c7304f805e..000000000000 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright © 2021 Amazon.com, Inc. or its affiliates. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#include -#include -#include -#include -#include - -#define NR_TEST_VCPUS 20 - -static struct kvm_vm *vm; -pthread_spinlock_t create_lock; - -#define TEST_TSC_KHZ 2345678UL -#define TEST_TSC_OFFSET 200000000 - -uint64_t tsc_sync; -static void guest_code(void) -{ - uint64_t start_tsc, local_tsc, tmp; - - start_tsc = rdtsc(); - do { - tmp = READ_ONCE(tsc_sync); - local_tsc = rdtsc(); - WRITE_ONCE(tsc_sync, local_tsc); - if (unlikely(local_tsc < tmp)) - GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0); - - } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ); - - GUEST_DONE(); -} - - -static void *run_vcpu(void *_cpu_nr) -{ - unsigned long vcpu_id = (unsigned long)_cpu_nr; - unsigned long failures = 0; - static bool first_cpu_done; - struct kvm_vcpu *vcpu; - - /* The kernel is fine, but vm_vcpu_add() needs locking */ - pthread_spin_lock(&create_lock); - - vcpu = vm_vcpu_add(vm, vcpu_id, guest_code); - - if (!first_cpu_done) { - first_cpu_done = true; - vcpu_set_msr(vcpu, MSR_IA32_TSC, TEST_TSC_OFFSET); - } - - pthread_spin_unlock(&create_lock); - - for (;;) { - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_DONE: - goto out; - - case UCALL_SYNC: - printf("Guest %d sync %lx %lx %ld\n", vcpu->id, - uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); - failures++; - break; - - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - out: - return (void *)failures; -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)); - - vm = vm_create(NR_TEST_VCPUS); - vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); - - pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); - pthread_t cpu_threads[NR_TEST_VCPUS]; - unsigned long cpu; - for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) - pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu); - - unsigned long failures = 0; - for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) { - void *this_cpu_failures; - pthread_join(cpu_threads[cpu], &this_cpu_failures); - failures += (unsigned long)this_cpu_failures; - } - - TEST_ASSERT(!failures, "TSC sync failed"); - pthread_spin_destroy(&create_lock); - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c deleted file mode 100644 index 57f157c06b39..000000000000 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ /dev/null @@ -1,295 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0 -/* - * ucna_injection_test - * - * Copyright (C) 2022, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Test that user space can inject UnCorrectable No Action required (UCNA) - * memory errors to the guest. - * - * The test starts one vCPU with the MCG_CMCI_P enabled. It verifies that - * proper UCNA errors can be injected to a vCPU with MCG_CMCI_P and - * corresponding per-bank control register (MCI_CTL2) bit enabled. - * The test also checks that the UCNA errors get recorded in the - * Machine Check bank registers no matter the error signal interrupts get - * delivered into the guest or not. - * - */ -#include -#include -#include -#include - -#include "kvm_util.h" -#include "mce.h" -#include "processor.h" -#include "test_util.h" -#include "apic.h" - -#define SYNC_FIRST_UCNA 9 -#define SYNC_SECOND_UCNA 10 -#define SYNC_GP 11 -#define FIRST_UCNA_ADDR 0xdeadbeef -#define SECOND_UCNA_ADDR 0xcafeb0ba - -/* - * Vector for the CMCI interrupt. - * Value is arbitrary. Any value in 0x20-0xFF should work: - * https://wiki.osdev.org/Interrupt_Vector_Table - */ -#define CMCI_VECTOR 0xa9 - -#define UCNA_BANK 0x7 // IMC0 bank - -#define MCI_CTL2_RESERVED_BIT BIT_ULL(29) - -static uint64_t supported_mcg_caps; - -/* - * Record states about the injected UCNA. - * The variables started with the 'i_' prefixes are recorded in interrupt - * handler. Variables without the 'i_' prefixes are recorded in guest main - * execution thread. - */ -static volatile uint64_t i_ucna_rcvd; -static volatile uint64_t i_ucna_addr; -static volatile uint64_t ucna_addr; -static volatile uint64_t ucna_addr2; - -struct thread_params { - struct kvm_vcpu *vcpu; - uint64_t *p_i_ucna_rcvd; - uint64_t *p_i_ucna_addr; - uint64_t *p_ucna_addr; - uint64_t *p_ucna_addr2; -}; - -static void verify_apic_base_addr(void) -{ - uint64_t msr = rdmsr(MSR_IA32_APICBASE); - uint64_t base = GET_APIC_BASE(msr); - - GUEST_ASSERT(base == APIC_DEFAULT_GPA); -} - -static void ucna_injection_guest_code(void) -{ - uint64_t ctl2; - verify_apic_base_addr(); - xapic_enable(); - - /* Sets up the interrupt vector and enables per-bank CMCI sigaling. */ - xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED); - ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); - wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); - - /* Enables interrupt in guest. */ - asm volatile("sti"); - - /* Let user space inject the first UCNA */ - GUEST_SYNC(SYNC_FIRST_UCNA); - - ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); - - /* Disables the per-bank CMCI signaling. 
*/ - ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); - wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN); - - /* Let the user space inject the second UCNA */ - GUEST_SYNC(SYNC_SECOND_UCNA); - - ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); - GUEST_DONE(); -} - -static void cmci_disabled_guest_code(void) -{ - uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); - wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); - - GUEST_DONE(); -} - -static void cmci_enabled_guest_code(void) -{ - uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); - wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_RESERVED_BIT); - - GUEST_DONE(); -} - -static void guest_cmci_handler(struct ex_regs *regs) -{ - i_ucna_rcvd++; - i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); - xapic_write_reg(APIC_EOI, 0); -} - -static void guest_gp_handler(struct ex_regs *regs) -{ - GUEST_SYNC(SYNC_GP); -} - -static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vcpu_run(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC"); - TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected."); - printf("vCPU received GP in guest.\n"); -} - -static void inject_ucna(struct kvm_vcpu *vcpu, uint64_t addr) { - /* - * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in - * the IA32_MCi_STATUS register. - * MSCOD=1 (BIT[16] - MscodDataRdErr). - * MCACOD=0x0090 (Memory controller error format, channel 0) - */ - uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | - MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090; - struct kvm_x86_mce mce = {}; - mce.status = status; - mce.mcg_status = 0; - /* - * MCM_ADDR_PHYS indicates the reported address is a physical address. - * Lowest 6 bits is the recoverable address LSB, i.e., the injected MCE - * is at 4KB granularity. 
- */ - mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; - mce.addr = addr; - mce.bank = UCNA_BANK; - - vcpu_ioctl(vcpu, KVM_X86_SET_MCE, &mce); -} - -static void *run_ucna_injection(void *arg) -{ - struct thread_params *params = (struct thread_params *)arg; - struct ucall uc; - int old; - int r; - - r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); - TEST_ASSERT(r == 0, - "pthread_setcanceltype failed with errno=%d", - r); - - vcpu_run(params->vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); - TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC"); - TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); - - printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); - - inject_ucna(params->vcpu, FIRST_UCNA_ADDR); - vcpu_run(params->vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); - TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC"); - TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); - - printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); - - inject_ucna(params->vcpu, SECOND_UCNA_ADDR); - vcpu_run(params->vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); - if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) { - TEST_ASSERT(false, "vCPU assertion failure: %s.", - (const char *)uc.args[0]); - } - - return NULL; -} - -static void test_ucna_injection(struct kvm_vcpu *vcpu, struct thread_params *params) -{ - struct kvm_vm *vm = vcpu->vm; - params->vcpu = vcpu; - params->p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd); - params->p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr); - params->p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr); - params->p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2); - - run_ucna_injection(params); - - TEST_ASSERT(*params->p_i_ucna_rcvd == 1, "Only first UCNA get signaled."); - TEST_ASSERT(*params->p_i_ucna_addr == FIRST_UCNA_ADDR, - "Only first UCNA reported addr get recorded via interrupt."); - TEST_ASSERT(*params->p_ucna_addr == FIRST_UCNA_ADDR, - "First injected UCNAs should get exposed via registers."); - TEST_ASSERT(*params->p_ucna_addr2 == SECOND_UCNA_ADDR, - "Second injected UCNAs should get exposed via registers."); - - printf("Test successful.\n" - "UCNA CMCI interrupts received: %ld\n" - "Last UCNA address received via CMCI: %lx\n" - "First UCNA address in vCPU thread: %lx\n" - "Second UCNA address in vCPU thread: %lx\n", - *params->p_i_ucna_rcvd, *params->p_i_ucna_addr, - *params->p_ucna_addr, *params->p_ucna_addr2); -} - -static void setup_mce_cap(struct kvm_vcpu *vcpu, bool enable_cmci_p) -{ - uint64_t mcg_caps = MCG_CTL_P | MCG_SER_P | MCG_LMCE_P | KVM_MAX_MCE_BANKS; - if (enable_cmci_p) - mcg_caps |= MCG_CMCI_P; - - mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK; - vcpu_ioctl(vcpu, KVM_X86_SETUP_MCE, &mcg_caps); -} - -static struct kvm_vcpu *create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid, - bool enable_cmci_p, void *guest_code) -{ - struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpuid, guest_code); - setup_mce_cap(vcpu, enable_cmci_p); - return vcpu; -} - -int main(int argc, char *argv[]) -{ - struct thread_params params; - struct kvm_vm *vm; - struct kvm_vcpu *ucna_vcpu; - struct kvm_vcpu *cmcidis_vcpu; - struct kvm_vcpu *cmci_vcpu; - - kvm_check_cap(KVM_CAP_MCE); - - vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0); - - kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, - &supported_mcg_caps); - - if (!(supported_mcg_caps & 
MCG_CMCI_P)) { - print_skip("MCG_CMCI_P is not supported"); - exit(KSFT_SKIP); - } - - ucna_vcpu = create_vcpu_with_mce_cap(vm, 0, true, ucna_injection_guest_code); - cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); - cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); - - vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - test_ucna_injection(ucna_vcpu, ¶ms); - run_vcpu_expect_gp(cmcidis_vcpu); - run_vcpu_expect_gp(cmci_vcpu); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c deleted file mode 100644 index 9481cbcf284f..000000000000 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" - -static void guest_ins_port80(uint8_t *buffer, unsigned int count) -{ - unsigned long end; - - if (count == 2) - end = (unsigned long)buffer + 1; - else - end = (unsigned long)buffer + 8192; - - asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory"); - GUEST_ASSERT_EQ(count, 0); - GUEST_ASSERT_EQ((unsigned long)buffer, end); -} - -static void guest_code(void) -{ - uint8_t buffer[8192]; - int i; - - /* - * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to - * test that KVM doesn't explode when userspace modifies the "count" on - * a userspace I/O exit. KVM isn't required to play nice with the I/O - * itself as KVM doesn't support manipulating the count, it just needs - * to not explode or overflow a buffer. - */ - guest_ins_port80(buffer, 2); - guest_ins_port80(buffer, 3); - - /* Verify KVM fills the buffer correctly when not stuffing RCX. */ - memset(buffer, 0, sizeof(buffer)); - guest_ins_port80(buffer, 8192); - for (i = 0; i < 8192; i++) - __GUEST_ASSERT(buffer[i] == 0xaa, - "Expected '0xaa', got '0x%x' at buffer[%u]", - buffer[i], i); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_regs regs; - struct kvm_run *run; - struct kvm_vm *vm; - struct ucall uc; - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - - memset(®s, 0, sizeof(regs)); - - while (1) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - if (get_ucall(vcpu, &uc)) - break; - - TEST_ASSERT(run->io.port == 0x80, - "Expected I/O at port 0x80, got port 0x%x", run->io.port); - - /* - * Modify the rep string count in RCX: 2 => 1 and 3 => 8192. - * Note, this abuses KVM's batching of rep string I/O to avoid - * getting stuck in an infinite loop. That behavior isn't in - * scope from a testing perspective as it's not ABI in any way, - * i.e. it really is abusing internal KVM knowledge. 
- */ - vcpu_regs_get(vcpu, ®s); - if (regs.rcx == 2) - regs.rcx = 1; - if (regs.rcx == 3) - regs.rcx = 8192; - memset((void *)run + run->io.data_offset, 0xaa, 4096); - vcpu_regs_set(vcpu, ®s); - } - - switch (uc.cmd) { - case UCALL_DONE: - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c deleted file mode 100644 index 32b2794b78fe..000000000000 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ /dev/null @@ -1,769 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2020, Google LLC. - * - * Tests for exiting into userspace on registered MSRs - */ -#include - -#include "kvm_test_harness.h" -#include "test_util.h" -#include "kvm_util.h" -#include "vmx.h" - -#define MSR_NON_EXISTENT 0x474f4f00 - -static u64 deny_bits = 0; -struct kvm_msr_filter filter_allow = { - .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, - .ranges = { - { - .flags = KVM_MSR_FILTER_READ | - KVM_MSR_FILTER_WRITE, - .nmsrs = 1, - /* Test an MSR the kernel knows about. */ - .base = MSR_IA32_XSS, - .bitmap = (uint8_t*)&deny_bits, - }, { - .flags = KVM_MSR_FILTER_READ | - KVM_MSR_FILTER_WRITE, - .nmsrs = 1, - /* Test an MSR the kernel doesn't know about. */ - .base = MSR_IA32_FLUSH_CMD, - .bitmap = (uint8_t*)&deny_bits, - }, { - .flags = KVM_MSR_FILTER_READ | - KVM_MSR_FILTER_WRITE, - .nmsrs = 1, - /* Test a fabricated MSR that no one knows about. */ - .base = MSR_NON_EXISTENT, - .bitmap = (uint8_t*)&deny_bits, - }, - }, -}; - -struct kvm_msr_filter filter_fs = { - .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, - .ranges = { - { - .flags = KVM_MSR_FILTER_READ, - .nmsrs = 1, - .base = MSR_FS_BASE, - .bitmap = (uint8_t*)&deny_bits, - }, - }, -}; - -struct kvm_msr_filter filter_gs = { - .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, - .ranges = { - { - .flags = KVM_MSR_FILTER_READ, - .nmsrs = 1, - .base = MSR_GS_BASE, - .bitmap = (uint8_t*)&deny_bits, - }, - }, -}; - -static uint64_t msr_non_existent_data; -static int guest_exception_count; -static u32 msr_reads, msr_writes; - -static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; -static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; -static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; -static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; -static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; -static u8 bitmap_deadbeef[1] = { 0x1 }; - -static void deny_msr(uint8_t *bitmap, u32 msr) -{ - u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1); - - bitmap[idx / 8] &= ~(1 << (idx % 8)); -} - -static void prepare_bitmaps(void) -{ - memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000)); - memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write)); - memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000)); - memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000)); - memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read)); - - deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL); - deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK); - deny_msr(bitmap_c0000000_read, MSR_GS_BASE); -} - -struct kvm_msr_filter filter_deny = { - .flags = KVM_MSR_FILTER_DEFAULT_DENY, - .ranges = { - { - .flags = KVM_MSR_FILTER_READ, - .base = 0x00000000, - .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, - .bitmap = bitmap_00000000, - }, { - .flags = KVM_MSR_FILTER_WRITE, - .base = 0x00000000, - .nmsrs = 
KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, - .bitmap = bitmap_00000000_write, - }, { - .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE, - .base = 0x40000000, - .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, - .bitmap = bitmap_40000000, - }, { - .flags = KVM_MSR_FILTER_READ, - .base = 0xc0000000, - .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, - .bitmap = bitmap_c0000000_read, - }, { - .flags = KVM_MSR_FILTER_WRITE, - .base = 0xc0000000, - .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, - .bitmap = bitmap_c0000000, - }, { - .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ, - .base = 0xdeadbeef, - .nmsrs = 1, - .bitmap = bitmap_deadbeef, - }, - }, -}; - -struct kvm_msr_filter no_filter_deny = { - .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, -}; - -/* - * Note: Force test_rdmsr() to not be inlined to prevent the labels, - * rdmsr_start and rdmsr_end, from being defined multiple times. - */ -static noinline uint64_t test_rdmsr(uint32_t msr) -{ - uint32_t a, d; - - guest_exception_count = 0; - - __asm__ __volatile__("rdmsr_start: rdmsr; rdmsr_end:" : - "=a"(a), "=d"(d) : "c"(msr) : "memory"); - - return a | ((uint64_t) d << 32); -} - -/* - * Note: Force test_wrmsr() to not be inlined to prevent the labels, - * wrmsr_start and wrmsr_end, from being defined multiple times. - */ -static noinline void test_wrmsr(uint32_t msr, uint64_t value) -{ - uint32_t a = value; - uint32_t d = value >> 32; - - guest_exception_count = 0; - - __asm__ __volatile__("wrmsr_start: wrmsr; wrmsr_end:" :: - "a"(a), "d"(d), "c"(msr) : "memory"); -} - -extern char rdmsr_start, rdmsr_end; -extern char wrmsr_start, wrmsr_end; - -/* - * Note: Force test_em_rdmsr() to not be inlined to prevent the labels, - * rdmsr_start and rdmsr_end, from being defined multiple times. - */ -static noinline uint64_t test_em_rdmsr(uint32_t msr) -{ - uint32_t a, d; - - guest_exception_count = 0; - - __asm__ __volatile__(KVM_FEP "em_rdmsr_start: rdmsr; em_rdmsr_end:" : - "=a"(a), "=d"(d) : "c"(msr) : "memory"); - - return a | ((uint64_t) d << 32); -} - -/* - * Note: Force test_em_wrmsr() to not be inlined to prevent the labels, - * wrmsr_start and wrmsr_end, from being defined multiple times. - */ -static noinline void test_em_wrmsr(uint32_t msr, uint64_t value) -{ - uint32_t a = value; - uint32_t d = value >> 32; - - guest_exception_count = 0; - - __asm__ __volatile__(KVM_FEP "em_wrmsr_start: wrmsr; em_wrmsr_end:" :: - "a"(a), "d"(d), "c"(msr) : "memory"); -} - -extern char em_rdmsr_start, em_rdmsr_end; -extern char em_wrmsr_start, em_wrmsr_end; - -static void guest_code_filter_allow(void) -{ - uint64_t data; - - /* - * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_XSS. - * - * A GP is thrown if anything other than 0 is written to - * MSR_IA32_XSS. - */ - data = test_rdmsr(MSR_IA32_XSS); - GUEST_ASSERT(data == 0); - GUEST_ASSERT(guest_exception_count == 0); - - test_wrmsr(MSR_IA32_XSS, 0); - GUEST_ASSERT(guest_exception_count == 0); - - test_wrmsr(MSR_IA32_XSS, 1); - GUEST_ASSERT(guest_exception_count == 1); - - /* - * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_FLUSH_CMD. - * - * A GP is thrown if MSR_IA32_FLUSH_CMD is read - * from or if a value other than 1 is written to it. 
- */ - test_rdmsr(MSR_IA32_FLUSH_CMD); - GUEST_ASSERT(guest_exception_count == 1); - - test_wrmsr(MSR_IA32_FLUSH_CMD, 0); - GUEST_ASSERT(guest_exception_count == 1); - - test_wrmsr(MSR_IA32_FLUSH_CMD, 1); - GUEST_ASSERT(guest_exception_count == 0); - - /* - * Test userspace intercepting rdmsr / wrmsr for MSR_NON_EXISTENT. - * - * Test that a fabricated MSR can pass through the kernel - * and be handled in userspace. - */ - test_wrmsr(MSR_NON_EXISTENT, 2); - GUEST_ASSERT(guest_exception_count == 0); - - data = test_rdmsr(MSR_NON_EXISTENT); - GUEST_ASSERT(data == 2); - GUEST_ASSERT(guest_exception_count == 0); - - if (is_forced_emulation_enabled) { - /* Let userspace know we aren't done. */ - GUEST_SYNC(0); - - /* - * Now run the same tests with the instruction emulator. - */ - data = test_em_rdmsr(MSR_IA32_XSS); - GUEST_ASSERT(data == 0); - GUEST_ASSERT(guest_exception_count == 0); - test_em_wrmsr(MSR_IA32_XSS, 0); - GUEST_ASSERT(guest_exception_count == 0); - test_em_wrmsr(MSR_IA32_XSS, 1); - GUEST_ASSERT(guest_exception_count == 1); - - test_em_rdmsr(MSR_IA32_FLUSH_CMD); - GUEST_ASSERT(guest_exception_count == 1); - test_em_wrmsr(MSR_IA32_FLUSH_CMD, 0); - GUEST_ASSERT(guest_exception_count == 1); - test_em_wrmsr(MSR_IA32_FLUSH_CMD, 1); - GUEST_ASSERT(guest_exception_count == 0); - - test_em_wrmsr(MSR_NON_EXISTENT, 2); - GUEST_ASSERT(guest_exception_count == 0); - data = test_em_rdmsr(MSR_NON_EXISTENT); - GUEST_ASSERT(data == 2); - GUEST_ASSERT(guest_exception_count == 0); - } - - GUEST_DONE(); -} - -static void guest_msr_calls(bool trapped) -{ - /* This goes into the in-kernel emulation */ - wrmsr(MSR_SYSCALL_MASK, 0); - - if (trapped) { - /* This goes into user space emulation */ - GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK); - GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE); - } else { - GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); - GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE); - } - - /* If trapped == true, this goes into user space emulation */ - wrmsr(MSR_IA32_POWER_CTL, 0x1234); - - /* This goes into the in-kernel emulation */ - rdmsr(MSR_IA32_POWER_CTL); - - /* Invalid MSR, should always be handled by user space exit */ - GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef); - wrmsr(0xdeadbeef, 0x1234); -} - -static void guest_code_filter_deny(void) -{ - guest_msr_calls(true); - - /* - * Disable msr filtering, so that the kernel - * handles everything in the next round - */ - GUEST_SYNC(0); - - guest_msr_calls(false); - - GUEST_DONE(); -} - -static void guest_code_permission_bitmap(void) -{ - uint64_t data; - - data = test_rdmsr(MSR_FS_BASE); - GUEST_ASSERT(data == MSR_FS_BASE); - data = test_rdmsr(MSR_GS_BASE); - GUEST_ASSERT(data != MSR_GS_BASE); - - /* Let userspace know to switch the filter */ - GUEST_SYNC(0); - - data = test_rdmsr(MSR_FS_BASE); - GUEST_ASSERT(data != MSR_FS_BASE); - data = test_rdmsr(MSR_GS_BASE); - GUEST_ASSERT(data == MSR_GS_BASE); - - GUEST_DONE(); -} - -static void __guest_gp_handler(struct ex_regs *regs, - char *r_start, char *r_end, - char *w_start, char *w_end) -{ - if (regs->rip == (uintptr_t)r_start) { - regs->rip = (uintptr_t)r_end; - regs->rax = 0; - regs->rdx = 0; - } else if (regs->rip == (uintptr_t)w_start) { - regs->rip = (uintptr_t)w_end; - } else { - GUEST_ASSERT(!"RIP is at an unknown location!"); - } - - ++guest_exception_count; -} - -static void guest_gp_handler(struct ex_regs *regs) -{ - __guest_gp_handler(regs, &rdmsr_start, &rdmsr_end, - &wrmsr_start, &wrmsr_end); -} - -static void guest_fep_gp_handler(struct 
ex_regs *regs) -{ - __guest_gp_handler(regs, &em_rdmsr_start, &em_rdmsr_end, - &em_wrmsr_start, &em_wrmsr_end); -} - -static void check_for_guest_assert(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (vcpu->run->exit_reason == KVM_EXIT_IO && - get_ucall(vcpu, &uc) == UCALL_ABORT) { - REPORT_GUEST_ASSERT(uc); - } -} - -static void process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) -{ - struct kvm_run *run = vcpu->run; - - check_for_guest_assert(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_RDMSR); - TEST_ASSERT(run->msr.index == msr_index, - "Unexpected msr (0x%04x), expected 0x%04x", - run->msr.index, msr_index); - - switch (run->msr.index) { - case MSR_IA32_XSS: - run->msr.data = 0; - break; - case MSR_IA32_FLUSH_CMD: - run->msr.error = 1; - break; - case MSR_NON_EXISTENT: - run->msr.data = msr_non_existent_data; - break; - case MSR_FS_BASE: - run->msr.data = MSR_FS_BASE; - break; - case MSR_GS_BASE: - run->msr.data = MSR_GS_BASE; - break; - default: - TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); - } -} - -static void process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) -{ - struct kvm_run *run = vcpu->run; - - check_for_guest_assert(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_WRMSR); - TEST_ASSERT(run->msr.index == msr_index, - "Unexpected msr (0x%04x), expected 0x%04x", - run->msr.index, msr_index); - - switch (run->msr.index) { - case MSR_IA32_XSS: - if (run->msr.data != 0) - run->msr.error = 1; - break; - case MSR_IA32_FLUSH_CMD: - if (run->msr.data != 1) - run->msr.error = 1; - break; - case MSR_NON_EXISTENT: - msr_non_existent_data = run->msr.data; - break; - default: - TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); - } -} - -static void process_ucall_done(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - check_for_guest_assert(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, - "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", - uc.cmd, UCALL_DONE); -} - -static uint64_t process_ucall(struct kvm_vcpu *vcpu) -{ - struct ucall uc = {}; - - check_for_guest_assert(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - break; - case UCALL_ABORT: - check_for_guest_assert(vcpu); - break; - case UCALL_DONE: - process_ucall_done(vcpu); - break; - default: - TEST_ASSERT(false, "Unexpected ucall"); - } - - return uc.cmd; -} - -static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu, - uint32_t msr_index) -{ - vcpu_run(vcpu); - process_rdmsr(vcpu, msr_index); -} - -static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu, - uint32_t msr_index) -{ - vcpu_run(vcpu); - process_wrmsr(vcpu, msr_index); -} - -static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu) -{ - vcpu_run(vcpu); - return process_ucall(vcpu); -} - -static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu) -{ - vcpu_run(vcpu); - process_ucall_done(vcpu); -} - -KVM_ONE_VCPU_TEST_SUITE(user_msr); - -KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) -{ - struct kvm_vm *vm = vcpu->vm; - uint64_t cmd; - int rc; - - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); - TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); - - rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); - TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); - - vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); - - 
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - - /* Process guest code userspace exits. */ - run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - - run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - - run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); - run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); - - vcpu_run(vcpu); - cmd = process_ucall(vcpu); - - if (is_forced_emulation_enabled) { - TEST_ASSERT_EQ(cmd, UCALL_SYNC); - vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); - - /* Process emulated rdmsr and wrmsr instructions. */ - run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - - run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - - run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); - run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); - - /* Confirm the guest completed without issues. */ - run_guest_then_process_ucall_done(vcpu); - } else { - TEST_ASSERT_EQ(cmd, UCALL_DONE); - printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n"); - } -} - -static int handle_ucall(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_SYNC: - vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); - break; - case UCALL_DONE: - return 1; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - return 0; -} - -static void handle_rdmsr(struct kvm_run *run) -{ - run->msr.data = run->msr.index; - msr_reads++; - - if (run->msr.index == MSR_SYSCALL_MASK || - run->msr.index == MSR_GS_BASE) { - TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, - "MSR read trap w/o access fault"); - } - - if (run->msr.index == 0xdeadbeef) { - TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, - "MSR deadbeef read trap w/o inval fault"); - } -} - -static void handle_wrmsr(struct kvm_run *run) -{ - /* ignore */ - msr_writes++; - - if (run->msr.index == MSR_IA32_POWER_CTL) { - TEST_ASSERT(run->msr.data == 0x1234, - "MSR data for MSR_IA32_POWER_CTL incorrect"); - TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, - "MSR_IA32_POWER_CTL trap w/o access fault"); - } - - if (run->msr.index == 0xdeadbeef) { - TEST_ASSERT(run->msr.data == 0x1234, - "MSR data for deadbeef incorrect"); - TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, - "deadbeef trap w/o inval fault"); - } -} - -KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny) -{ - struct kvm_vm *vm = vcpu->vm; - struct kvm_run *run = vcpu->run; - int rc; - - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); - TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL | - KVM_MSR_EXIT_REASON_UNKNOWN | - KVM_MSR_EXIT_REASON_FILTER); - - rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); - TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); - - prepare_bitmaps(); - vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny); - - while (1) { - vcpu_run(vcpu); - - switch (run->exit_reason) { - case KVM_EXIT_X86_RDMSR: - 
handle_rdmsr(run); - break; - case KVM_EXIT_X86_WRMSR: - handle_wrmsr(run); - break; - case KVM_EXIT_IO: - if (handle_ucall(vcpu)) - goto done; - break; - } - - } - -done: - TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space"); - TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space"); -} - -KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap) -{ - struct kvm_vm *vm = vcpu->vm; - int rc; - - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); - TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); - - rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); - TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); - - vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_fs); - run_guest_then_process_rdmsr(vcpu, MSR_FS_BASE); - TEST_ASSERT(run_guest_then_process_ucall(vcpu) == UCALL_SYNC, - "Expected ucall state to be UCALL_SYNC."); - vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs); - run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE); - run_guest_then_process_ucall_done(vcpu); -} - -#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \ -({ \ - int r = __vm_ioctl(vm, cmd, arg); \ - \ - if (flag & valid_mask) \ - TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \ - else \ - TEST_ASSERT(r == -1 && errno == EINVAL, \ - "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \ - #cmd, flag, r, errno, strerror(errno)); \ -}) - -static void run_user_space_msr_flag_test(struct kvm_vm *vm) -{ - struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR }; - int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE; - int rc; - int i; - - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); - TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - - for (i = 0; i < nflags; i++) { - cap.args[0] = BIT_ULL(i); - test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap, - BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK); - } -} - -static void run_msr_filter_flag_test(struct kvm_vm *vm) -{ - u64 deny_bits = 0; - struct kvm_msr_filter filter = { - .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, - .ranges = { - { - .flags = KVM_MSR_FILTER_READ, - .nmsrs = 1, - .base = 0, - .bitmap = (uint8_t *)&deny_bits, - }, - }, - }; - int nflags; - int rc; - int i; - - rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); - TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); - - nflags = sizeof(filter.flags) * BITS_PER_BYTE; - for (i = 0; i < nflags; i++) { - filter.flags = BIT_ULL(i); - test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, - BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK); - } - - filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW; - nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE; - for (i = 0; i < nflags; i++) { - filter.ranges[0].flags = BIT_ULL(i); - test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, - BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK); - } -} - -/* Test that attempts to write to the unused bits in a flag fails. */ -KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) -{ - struct kvm_vm *vm = vcpu->vm; - - /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */ - run_user_space_msr_flag_test(vm); - - /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. 
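/*
 * Illustrative sketch, not part of the removed test: how a single
 * KVM_X86_SET_MSR_FILTER range like the ones used above is typically built.
 * Bit (msr - base) of .bitmap guards MSR @msr for the access types named in
 * .flags, .nmsrs bounds how many bits are consulted, and leaving the bit
 * clear denies the access (assumption inferred from the deny_bits usage
 * nearby).  The helper name and single-byte bitmap are made up for the
 * example; uint8_t/uint32_t assume <stdint.h>.  A denied RDMSR then exits to
 * userspace as KVM_EXIT_X86_RDMSR with reason KVM_MSR_EXIT_REASON_FILTER,
 * which is what handle_rdmsr() above checks.
 */
static void deny_one_msr_read(struct kvm_vm *vm, uint32_t msr)
{
	static uint8_t bitmap;		/* one MSR -> one bit, left clear = deny */
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
		.ranges = {
			{
				.flags  = KVM_MSR_FILTER_READ,
				.base   = msr,
				.nmsrs  = 1,
				.bitmap = &bitmap,
			},
		},
	};

	vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
}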
*/ - run_msr_filter_flag_test(vm); -} - -int main(int argc, char *argv[]) -{ - return test_harness_run(argc, argv); -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c deleted file mode 100644 index a81a24761aac..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_apic_access_test - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * The first subtest simply checks to see that an L2 guest can be - * launched with a valid APIC-access address that is backed by a - * page of L1 physical memory. - * - * The second subtest sets the APIC-access address to a (valid) L1 - * physical address that is not backed by memory. KVM can't handle - * this situation, so resuming L2 should result in a KVM exit for - * internal error (emulation). This is not an architectural - * requirement. It is just a shortcoming of KVM. The internal error - * is unfortunate, but it's better than what used to happen! - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include - -#include "kselftest.h" - -static void l2_guest_code(void) -{ - /* Exit to L1 */ - __asm__ __volatile__("vmcall"); -} - -static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - control = vmreadz(SECONDARY_VM_EXEC_CONTROL); - control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmwrite(SECONDARY_VM_EXEC_CONTROL, control); - vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa); - - /* Try to launch L2 with the memory-backed APIC-access address. */ - GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - vmwrite(APIC_ACCESS_ADDR, high_gpa); - - /* Try to resume L2 with the unbacked APIC-access address. 
*/ - GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - unsigned long apic_access_addr = ~0ul; - vm_vaddr_t vmx_pages_gva; - unsigned long high_gpa; - struct vmx_pages *vmx; - bool done = false; - - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - high_gpa = (vm->max_gfn - 1) << vm->page_shift; - - vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - prepare_virtualize_apic_accesses(vmx, vm); - vcpu_args_set(vcpu, 2, vmx_pages_gva, high_gpa); - - while (!done) { - volatile struct kvm_run *run = vcpu->run; - struct ucall uc; - - vcpu_run(vcpu); - if (apic_access_addr == high_gpa) { - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); - TEST_ASSERT(run->internal.suberror == - KVM_INTERNAL_ERROR_EMULATION, - "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u", - run->internal.suberror); - break; - } - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - apic_access_addr = uc.args[1]; - break; - case UCALL_DONE: - done = true; - break; - default: - TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd); - } - } - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c deleted file mode 100644 index dad988351493..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_close_while_nested - * - * Copyright (C) 2019, Red Hat, Inc. - * - * Verify that nothing bad happens if a KVM user exits with open - * file descriptors while executing a nested guest. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include - -#include "kselftest.h" - -enum { - PORT_L0_EXIT = 0x2000, -}; - -static void l2_guest_code(void) -{ - /* Exit to L0 */ - asm volatile("inb %%dx, %%al" - : : [port] "d" (PORT_L0_EXIT) : "rax"); -} - -static void l1_guest_code(struct vmx_pages *vmx_pages) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(0); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - for (;;) { - volatile struct kvm_run *run = vcpu->run; - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - if (run->io.port == PORT_L0_EXIT) - break; - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c deleted file mode 100644 index fa512d033205..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM dirty page logging test - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -/* The memory slot index to track dirty pages */ -#define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 - -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 - -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 - -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - /* Exit to L1 and never come back. */ - vmcall(); -} - -static void l2_guest_code_ept_enabled(void) -{ - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); -} - -static void l2_guest_code_ept_disabled(void) -{ - /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); -} - -void l1_guest_code(struct vmx_pages *vmx) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - void *l2_rip; - - GUEST_ASSERT(vmx->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx)); - GUEST_ASSERT(load_vmcs(vmx)); - - if (vmx->eptp_gpa) - l2_rip = l2_guest_code_ept_enabled; - else - l2_rip = l2_guest_code_ept_disabled; - - prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(false); - GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_DONE(); -} - -static void test_vmx_dirty_log(bool enable_ept) -{ - vm_vaddr_t vmx_pages_gva = 0; - struct vmx_pages *vmx; - unsigned long *bmap; - uint64_t *host_test_mem; - - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - bool done = false; - - pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - /* Add an extra memory slot for testing dirty logging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, - TEST_MEM_SLOT_INDEX, - TEST_MEM_PAGES, - KVM_MEM_LOG_DIRTY_PAGES); - - /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This - * affects both L1 and L2. However... - */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); - - /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. - * - * Note that prepare_eptp should be called only L1's GPA map is done, - * meaning after the last call to virt_map. 
- * - * When EPT is disabled, the L2 guest code will still access the same L1 - * GPAs as the EPT enabled case. - */ - if (enable_ept) { - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); - } - - bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); - - while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); - break; - case UCALL_DONE: - done = true; - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - test_vmx_dirty_log(/*enable_ept=*/false); - - if (kvm_cpu_has_ept()) - test_vmx_dirty_log(/*enable_ept=*/true); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c deleted file mode 100644 index 3fd6eceab46f..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#include -#include -#include -#include - -#include "kselftest.h" - -static void guest_ud_handler(struct ex_regs *regs) -{ - /* Loop on the ud2 until guest state is made invalid. */ -} - -static void guest_code(void) -{ - asm volatile("ud2"); -} - -static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - - vcpu_run(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); - TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, - "Expected emulation failure, got %d", - run->emulation_failure.suberror); -} - -static void run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) -{ - /* - * Always run twice to verify KVM handles the case where _KVM_ queues - * an exception with invalid state and then exits to userspace, i.e. - * that KVM doesn't explode if userspace ignores the initial error. 
- */ - __run_vcpu_with_invalid_state(vcpu); - __run_vcpu_with_invalid_state(vcpu); -} - -static void set_timer(void) -{ - struct itimerval timer; - - timer.it_value.tv_sec = 0; - timer.it_value.tv_usec = 200; - timer.it_interval = timer.it_value; - TEST_ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); -} - -static void set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set) -{ - static struct kvm_sregs sregs; - - if (!sregs.cr0) - vcpu_sregs_get(vcpu, &sregs); - sregs.tr.unusable = !!set; - vcpu_sregs_set(vcpu, &sregs); -} - -static void set_invalid_guest_state(struct kvm_vcpu *vcpu) -{ - set_or_clear_invalid_guest_state(vcpu, true); -} - -static void clear_invalid_guest_state(struct kvm_vcpu *vcpu) -{ - set_or_clear_invalid_guest_state(vcpu, false); -} - -static struct kvm_vcpu *get_set_sigalrm_vcpu(struct kvm_vcpu *__vcpu) -{ - static struct kvm_vcpu *vcpu = NULL; - - if (__vcpu) - vcpu = __vcpu; - return vcpu; -} - -static void sigalrm_handler(int sig) -{ - struct kvm_vcpu *vcpu = get_set_sigalrm_vcpu(NULL); - struct kvm_vcpu_events events; - - TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); - - vcpu_events_get(vcpu, &events); - - /* - * If an exception is pending, attempt KVM_RUN with invalid guest, - * otherwise rearm the timer and keep doing so until the timer fires - * between KVM queueing an exception and re-entering the guest. - */ - if (events.exception.pending) { - set_invalid_guest_state(vcpu); - run_vcpu_with_invalid_state(vcpu); - } else { - set_timer(); - } -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(host_cpu_is_intel); - TEST_REQUIRE(!vm_is_unrestricted_guest(NULL)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - get_set_sigalrm_vcpu(vcpu); - - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); - - /* - * Stuff invalid guest state for L2 by making TR unusuable. The next - * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support - * emulating invalid guest state for L2. - */ - set_invalid_guest_state(vcpu); - run_vcpu_with_invalid_state(vcpu); - - /* - * Verify KVM also handles the case where userspace gains control while - * an exception is pending and stuffs invalid state. Run with valid - * guest state and a timer firing every 200us, and attempt to enter the - * guest with invalid state when the handler interrupts KVM with an - * exception pending. - */ - clear_invalid_guest_state(vcpu); - TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, - "Failed to register SIGALRM handler, errno = %d (%s)", - errno, strerror(errno)); - - set_timer(); - run_vcpu_with_invalid_state(vcpu); -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c deleted file mode 100644 index a100ee5f0009..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include - -#include "kselftest.h" - -#define ARBITRARY_IO_PORT 0x2000 - -static struct kvm_vm *vm; - -static void l2_guest_code(void) -{ - /* - * Generate an exit to L0 userspace, i.e. main(), via I/O to an - * arbitrary port. 
- */ - asm volatile("inb %%dx, %%al" - : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); -} - -static void l1_guest_code(struct vmx_pages *vmx_pages) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* - * L2 must be run without unrestricted guest, verify that the selftests - * library hasn't enabled it. Because KVM selftests jump directly to - * 64-bit mode, unrestricted guest support isn't required. - */ - GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) || - !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST)); - - GUEST_ASSERT(!vmlaunch()); - - /* L2 should triple fault after main() stuffs invalid guest state. */ - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva; - struct kvm_sregs sregs; - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct ucall uc; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - vcpu_run(vcpu); - - run = vcpu->run; - - /* - * The first exit to L0 userspace should be an I/O access from L2. - * Running L1 should launch L2 without triggering an exit to userspace. - */ - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, - "Expected IN from port %d from L2, got port %d", - ARBITRARY_IO_PORT, run->io.port); - - /* - * Stuff invalid guest state for L2 by making TR unusuable. The next - * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support - * emulating invalid guest state for L2. - */ - memset(&sregs, 0, sizeof(sregs)); - vcpu_sregs_get(vcpu, &sregs); - sregs.tr.unusable = 1; - vcpu_sregs_set(vcpu, &sregs); - - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_DONE: - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c deleted file mode 100644 index 90720b6205f4..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VMX control MSR test - * - * Copyright (C) 2022 Google LLC. - * - * Tests for KVM ownership of bits in the VMX entry/exit control MSRs. Checks - * that KVM will set owned bits where appropriate, and will not if - * KVM_X86_QUIRK_TWEAK_VMX_CTRL_MSRS is disabled. 
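/*
 * Background sketch, not taken from the removed file: the IA32_VMX_*_CTLS
 * capability MSRs exercised below encode their constraints as allowed-0
 * settings in bits 31:0 (a set bit means the control must be 1) and
 * allowed-1 settings in bits 63:32 (a clear bit means the control must be
 * 0).  A hypervisor typically folds a desired control value through both
 * masks as shown here; the helper name is illustrative and
 * uint32_t/uint64_t assume <stdint.h>.
 */
static inline uint32_t vmx_adjust_ctls(uint64_t cap_msr, uint32_t wanted)
{
	uint32_t must_be_one = (uint32_t)cap_msr;		/* allowed-0 settings */
	uint32_t may_be_one  = (uint32_t)(cap_msr >> 32);	/* allowed-1 settings */

	return (wanted | must_be_one) & may_be_one;
}

/*
 * Example use (illustrative):
 *   vmwrite(PIN_BASED_VM_EXEC_CONTROL,
 *	     vmx_adjust_ctls(rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS), wanted));
 */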
- */ -#include -#include "kvm_util.h" -#include "vmx.h" - -static void vmx_fixed1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index, - uint64_t mask) -{ - uint64_t val = vcpu_get_msr(vcpu, msr_index); - uint64_t bit; - - mask &= val; - - for_each_set_bit(bit, &mask, 64) { - vcpu_set_msr(vcpu, msr_index, val & ~BIT_ULL(bit)); - vcpu_set_msr(vcpu, msr_index, val); - } -} - -static void vmx_fixed0_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index, - uint64_t mask) -{ - uint64_t val = vcpu_get_msr(vcpu, msr_index); - uint64_t bit; - - mask = ~mask | val; - - for_each_clear_bit(bit, &mask, 64) { - vcpu_set_msr(vcpu, msr_index, val | BIT_ULL(bit)); - vcpu_set_msr(vcpu, msr_index, val); - } -} - -static void vmx_fixed0and1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index) -{ - vmx_fixed0_msr_test(vcpu, msr_index, GENMASK_ULL(31, 0)); - vmx_fixed1_msr_test(vcpu, msr_index, GENMASK_ULL(63, 32)); -} - -static void vmx_save_restore_msrs_test(struct kvm_vcpu *vcpu) -{ - vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, 0); - vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, -1ull); - - vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_BASIC, - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55)); - - vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_MISC, - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | - BIT_ULL(15) | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30)); - - vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2); - vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, -1ull); - vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS); - vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS); - vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS); - vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS); - vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_VMFUNC, -1ull); -} - -static void __ia32_feature_control_msr_test(struct kvm_vcpu *vcpu, - uint64_t msr_bit, - struct kvm_x86_cpu_feature feature) -{ - uint64_t val; - - vcpu_clear_cpuid_feature(vcpu, feature); - - val = vcpu_get_msr(vcpu, MSR_IA32_FEAT_CTL); - vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); - vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); - vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); - vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); - vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val); - - if (!kvm_cpu_has(feature)) - return; - - vcpu_set_cpuid_feature(vcpu, feature); -} - -static void ia32_feature_control_msr_test(struct kvm_vcpu *vcpu) -{ - uint64_t supported_bits = FEAT_CTL_LOCKED | - FEAT_CTL_VMX_ENABLED_INSIDE_SMX | - FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | - FEAT_CTL_SGX_LC_ENABLED | - FEAT_CTL_SGX_ENABLED | - FEAT_CTL_LMCE_ENABLED; - int bit, r; - - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_SMX); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_VMX); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX, X86_FEATURE_VMX); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX_LC); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_ENABLED, X86_FEATURE_SGX); - __ia32_feature_control_msr_test(vcpu, FEAT_CTL_LMCE_ENABLED, X86_FEATURE_MCE); - - for_each_clear_bit(bit, &supported_bits, 64) { - r = _vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, BIT(bit)); - TEST_ASSERT(r == 0, - "Setting reserved bit %d in IA32_FEATURE_CONTROL should fail", bit); - } -} - -int 
main(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - /* No need to actually do KVM_RUN, thus no guest code. */ - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - vmx_save_restore_msrs_test(vcpu); - ia32_feature_control_msr_test(vcpu); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c deleted file mode 100644 index 1759fa5cb3f2..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_nested_tsc_scaling_test - * - * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * This test case verifies that nested TSC scaling behaves as expected when - * both L1 and L2 are scaled using different ratios. For this test we scale - * L1 down and scale L2 up. - */ - -#include - -#include "kvm_util.h" -#include "vmx.h" -#include "kselftest.h" - -/* L2 is scaled up (from L1's perspective) by this factor */ -#define L2_SCALE_FACTOR 4ULL - -#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) -#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) - -#define L2_GUEST_STACK_SIZE 64 - -enum { USLEEP, UCHECK_L1, UCHECK_L2 }; -#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) -#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) - - -/* - * This function checks whether the "actual" TSC frequency of a guest matches - * its expected frequency. In order to account for delays in taking the TSC - * measurements, a difference of 1% between the actual and the expected value - * is tolerated. - */ -static void compare_tsc_freq(uint64_t actual, uint64_t expected) -{ - uint64_t tolerance, thresh_low, thresh_high; - - tolerance = expected / 100; - thresh_low = expected - tolerance; - thresh_high = expected + tolerance; - - TEST_ASSERT(thresh_low < actual, - "TSC freq is expected to be between %"PRIu64" and %"PRIu64 - " but it actually is %"PRIu64, - thresh_low, thresh_high, actual); - TEST_ASSERT(thresh_high > actual, - "TSC freq is expected to be between %"PRIu64" and %"PRIu64 - " but it actually is %"PRIu64, - thresh_low, thresh_high, actual); -} - -static void check_tsc_freq(int level) -{ - uint64_t tsc_start, tsc_end, tsc_freq; - - /* - * Reading the TSC twice with about a second's difference should give - * us an approximation of the TSC frequency from the guest's - * perspective. Now, this won't be completely accurate, but it should - * be good enough for the purposes of this test. 
- */ - tsc_start = rdmsr(MSR_IA32_TSC); - GUEST_SLEEP(1); - tsc_end = rdmsr(MSR_IA32_TSC); - - tsc_freq = tsc_end - tsc_start; - - GUEST_CHECK(level, tsc_freq); -} - -static void l2_guest_code(void) -{ - check_tsc_freq(UCHECK_L2); - - /* exit to L1 */ - __asm__ __volatile__("vmcall"); -} - -static void l1_guest_code(struct vmx_pages *vmx_pages) -{ - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - - /* check that L1's frequency looks alright before launching L2 */ - check_tsc_freq(UCHECK_L1); - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* prepare the VMCS for L2 execution */ - prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* enable TSC offsetting and TSC scaling for L2 */ - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - - control = vmreadz(SECONDARY_VM_EXEC_CONTROL); - control |= SECONDARY_EXEC_TSC_SCALING; - vmwrite(SECONDARY_VM_EXEC_CONTROL, control); - - vmwrite(TSC_OFFSET, TSC_OFFSET_L2); - vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); - vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); - - /* launch L2 */ - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - /* check that L1's frequency still looks good */ - check_tsc_freq(UCHECK_L1); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - vm_vaddr_t vmx_pages_gva; - - uint64_t tsc_start, tsc_end; - uint64_t tsc_khz; - uint64_t l1_scale_factor; - uint64_t l0_tsc_freq = 0; - uint64_t l1_tsc_freq = 0; - uint64_t l2_tsc_freq = 0; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); - - /* - * We set L1's scale factor to be a random number from 2 to 10. - * Ideally we would do the same for L2's factor but that one is - * referenced by both main() and l1_guest_code() and using a global - * variable does not work. 
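/*
 * Side note with a sketch, not from the removed test: the VMX TSC multiplier
 * programmed above (TSC_MULTIPLIER_L2) is a fixed-point value with 48
 * fractional bits, so L2's observed TSC rate is L1's rate scaled by
 * multiplier / 2^48, with TSC_OFFSET added afterwards.  The helper below is
 * illustrative only and relies on the GCC/Clang unsigned __int128 extension.
 */
static inline uint64_t scale_tsc(uint64_t tsc, uint64_t multiplier)
{
	/* 1.0 is encoded as 1ull << 48, e.g. L2_SCALE_FACTOR << 48 above. */
	return (uint64_t)(((unsigned __int128)tsc * multiplier) >> 48);
}

/*
 * Relations the test expects to hold (to within the 1% tolerance):
 *   l1_tsc_freq ~= l0_tsc_freq / l1_scale_factor           (KVM_SET_TSC_KHZ)
 *   l2_tsc_freq ~= scale_tsc(l1_tsc_freq, TSC_MULTIPLIER_L2)
 *               == l1_tsc_freq * L2_SCALE_FACTOR
 */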
- */ - srand(time(NULL)); - l1_scale_factor = (rand() % 9) + 2; - printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); - printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); - - tsc_start = rdtsc(); - sleep(1); - tsc_end = rdtsc(); - - l0_tsc_freq = tsc_end - tsc_start; - printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); - - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); - TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); - - /* scale down L1's TSC frequency */ - vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor)); - - for (;;) { - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - case UCALL_SYNC: - switch (uc.args[0]) { - case USLEEP: - sleep(uc.args[1]); - break; - case UCHECK_L1: - l1_tsc_freq = uc.args[1]; - printf("L1's TSC frequency is around: %"PRIu64 - "\n", l1_tsc_freq); - - compare_tsc_freq(l1_tsc_freq, - l0_tsc_freq / l1_scale_factor); - break; - case UCHECK_L2: - l2_tsc_freq = uc.args[1]; - printf("L2's TSC frequency is around: %"PRIu64 - "\n", l2_tsc_freq); - - compare_tsc_freq(l2_tsc_freq, - l1_tsc_freq * L2_SCALE_FACTOR); - break; - } - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c deleted file mode 100644 index a1f5ff45d518..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ /dev/null @@ -1,247 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Test for VMX-pmu perf capability msr - * - * Copyright (C) 2021 Intel Corporation - * - * Test to check the effect of various CPUID settings on - * MSR_IA32_PERF_CAPABILITIES MSR, and check that what - * we write with KVM_SET_MSR is _not_ modified by the guest - * and check it can be retrieved with KVM_GET_MSR, also test - * the invalid LBR formats are rejected. - */ -#include - -#include - -#include "kvm_test_harness.h" -#include "kvm_util.h" -#include "vmx.h" - -static union perf_capabilities { - struct { - u64 lbr_format:6; - u64 pebs_trap:1; - u64 pebs_arch_reg:1; - u64 pebs_format:4; - u64 smm_freeze:1; - u64 full_width_write:1; - u64 pebs_baseline:1; - u64 perf_metrics:1; - u64 pebs_output_pt_available:1; - u64 anythread_deprecated:1; - }; - u64 capabilities; -} host_cap; - -/* - * The LBR format and most PEBS features are immutable, all other features are - * fungible (if supported by the host and KVM). 
- */ -static const union perf_capabilities immutable_caps = { - .lbr_format = -1, - .pebs_trap = 1, - .pebs_arch_reg = 1, - .pebs_format = -1, - .pebs_baseline = 1, -}; - -static const union perf_capabilities format_caps = { - .lbr_format = -1, - .pebs_format = -1, -}; - -static void guest_test_perf_capabilities_gp(uint64_t val) -{ - uint8_t vector = wrmsr_safe(MSR_IA32_PERF_CAPABILITIES, val); - - __GUEST_ASSERT(vector == GP_VECTOR, - "Expected #GP for value '0x%lx', got vector '0x%x'", - val, vector); -} - -static void guest_code(uint64_t current_val) -{ - int i; - - guest_test_perf_capabilities_gp(current_val); - guest_test_perf_capabilities_gp(0); - - for (i = 0; i < 64; i++) - guest_test_perf_capabilities_gp(current_val ^ BIT_ULL(i)); - - GUEST_DONE(); -} - -KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps); - -/* - * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value - * written, that the guest always sees the userspace controlled value, and that - * PERF_CAPABILITIES is immutable after KVM_RUN. - */ -KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) -{ - struct ucall uc; - int r, i; - - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); - - vcpu_args_set(vcpu, 1, host_cap.capabilities); - vcpu_run(vcpu); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - break; - default: - TEST_FAIL("Unexpected ucall: %lu", uc.cmd); - } - - TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), - host_cap.capabilities); - - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); - - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); - TEST_ASSERT(!r, "Post-KVM_RUN write '0' didn't fail"); - - for (i = 0; i < 64; i++) { - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, - host_cap.capabilities ^ BIT_ULL(i)); - TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx'didn't fail", - host_cap.capabilities ^ BIT_ULL(i)); - } -} - -/* - * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features - * enabled, as well as '0' (to disable all features). - */ -KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code) -{ - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); -} - -KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code) -{ - const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities; - int bit; - - for_each_set_bit(bit, &fungible_caps, 64) { - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(bit)); - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, - host_cap.capabilities & ~BIT_ULL(bit)); - } - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); -} - -/* - * Verify KVM rejects attempts to set unsupported and/or immutable features in - * PERF_CAPABILITIES. Note, LBR format and PEBS format need to be validated - * separately as they are multi-bit values, e.g. toggling or setting a single - * bit can generate a false positive without dedicated safeguards. 
- */ -KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code) -{ - const uint64_t reserved_caps = (~host_cap.capabilities | - immutable_caps.capabilities) & - ~format_caps.capabilities; - union perf_capabilities val = host_cap; - int r, bit; - - for_each_set_bit(bit, &reserved_caps, 64) { - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, - host_cap.capabilities ^ BIT_ULL(bit)); - TEST_ASSERT(!r, "%s immutable feature 0x%llx (bit %d) didn't fail", - host_cap.capabilities & BIT_ULL(bit) ? "Setting" : "Clearing", - BIT_ULL(bit), bit); - } - - /* - * KVM only supports the host's native LBR format, as well as '0' (to - * disable LBR support). Verify KVM rejects all other LBR formats. - */ - for (val.lbr_format = 1; val.lbr_format; val.lbr_format++) { - if (val.lbr_format == host_cap.lbr_format) - continue; - - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); - TEST_ASSERT(!r, "Bad LBR FMT = 0x%x didn't fail, host = 0x%x", - val.lbr_format, host_cap.lbr_format); - } - - /* Ditto for the PEBS format. */ - for (val.pebs_format = 1; val.pebs_format; val.pebs_format++) { - if (val.pebs_format == host_cap.pebs_format) - continue; - - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); - TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x", - val.pebs_format, host_cap.pebs_format); - } -} - -/* - * Test that LBR MSRs are writable when LBRs are enabled, and then verify that - * disabling the vPMU via CPUID also disables LBR support. Set bits 2:0 of - * LBR_TOS as those bits are writable across all uarch implementations (arch - * LBRs will need to poke a different MSR). - */ -KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code) -{ - int r; - - if (!host_cap.lbr_format) - return; - - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); - vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); - - vcpu_clear_cpuid_entry(vcpu, X86_PROPERTY_PMU_VERSION.function); - - r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); - TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU"); -} - -KVM_ONE_VCPU_TEST(vmx_pmu_caps, perf_capabilities_unsupported, guest_code) -{ - uint64_t val; - int i, r; - - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); - val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES); - TEST_ASSERT_EQ(val, host_cap.capabilities); - - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_PDCM); - - val = vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES); - TEST_ASSERT_EQ(val, 0); - - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); - - for (i = 0; i < 64; i++) { - r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(i)); - TEST_ASSERT(!r, "Setting PERF_CAPABILITIES bit %d (= 0x%llx) should fail without PDCM", - i, BIT_ULL(i)); - } -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_is_pmu_enabled()); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); - - TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); - TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); - - host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES); - - TEST_ASSERT(host_cap.full_width_write, - "Full-width writes should always be supported"); - - return test_harness_run(argc, argv); -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c deleted file mode 100644 index 00dd2ac07a61..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ /dev/null @@ -1,245 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0-only -/* - * VMX-preemption timer test - * - * Copyright (C) 2020, Google, LLC. - * - * Test to ensure the VM-Enter after migration doesn't - * incorrectly restarts the timer with the full timer - * value instead of partially decayed timer value - * - */ -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#define PREEMPTION_TIMER_VALUE 100000000ull -#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull - -u32 vmx_pt_rate; -bool l2_save_restore_done; -static u64 l2_vmx_pt_start; -volatile u64 l2_vmx_pt_finish; - -union vmx_basic basic; -union vmx_ctrl_msr ctrl_pin_rev; -union vmx_ctrl_msr ctrl_exit_rev; - -void l2_guest_code(void) -{ - u64 vmx_pt_delta; - - vmcall(); - l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; - - /* - * Wait until the 1st threshold has passed - */ - do { - l2_vmx_pt_finish = rdtsc(); - vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >> - vmx_pt_rate; - } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1); - - /* - * Force L2 through Save and Restore cycle - */ - GUEST_SYNC(1); - - l2_save_restore_done = 1; - - /* - * Now wait for the preemption timer to fire and - * exit to L1 - */ - while ((l2_vmx_pt_finish = rdtsc())) - ; -} - -void l1_guest_code(struct vmx_pages *vmx_pages) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - u64 l1_vmx_pt_start; - u64 l1_vmx_pt_finish; - u64 l1_tsc_deadline, l2_tsc_deadline; - - GUEST_ASSERT(vmx_pages->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - /* - * Check for Preemption timer support - */ - basic.val = rdmsr(MSR_IA32_VMX_BASIC); - ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS - : MSR_IA32_VMX_PINBASED_CTLS); - ctrl_exit_rev.val = rdmsr(basic.ctrl ? 
MSR_IA32_VMX_TRUE_EXIT_CTLS - : MSR_IA32_VMX_EXIT_CTLS); - - if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - return; - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); - - /* - * Turn on PIN control and resume the guest - */ - GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL, - vmreadz(PIN_BASED_VM_EXEC_CONTROL) | - PIN_BASED_VMX_PREEMPTION_TIMER)); - - GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE, - PREEMPTION_TIMER_VALUE)); - - vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; - - l2_save_restore_done = 0; - - l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; - - GUEST_ASSERT(!vmresume()); - - l1_vmx_pt_finish = rdtsc(); - - /* - * Ensure exit from L2 happens after L2 goes through - * save and restore - */ - GUEST_ASSERT(l2_save_restore_done); - - /* - * Ensure the exit from L2 is due to preemption timer expiry - */ - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER); - - l1_tsc_deadline = l1_vmx_pt_start + - (PREEMPTION_TIMER_VALUE << vmx_pt_rate); - - l2_tsc_deadline = l2_vmx_pt_start + - (PREEMPTION_TIMER_VALUE << vmx_pt_rate); - - /* - * Sync with the host and pass the l1|l2 pt_expiry_finish times and - * tsc deadlines so that host can verify they are as expected - */ - GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline, - l2_vmx_pt_finish, l2_tsc_deadline); -} - -void guest_code(struct vmx_pages *vmx_pages) -{ - if (vmx_pages) - l1_guest_code(vmx_pages); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva = 0; - - struct kvm_regs regs1, regs2; - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - struct kvm_x86_state *state; - struct ucall uc; - int stage; - - /* - * AMD currently does not implement any VMX features, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - vcpu_regs_get(vcpu, ®s1); - - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - for (stage = 1;; stage++) { - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - /* UCALL_SYNC is handled here. */ - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - /* - * If this stage 2 then we should verify the vmx pt expiry - * is as expected. - * From L1's perspective verify Preemption timer hasn't - * expired too early. - * From L2's perspective verify Preemption timer hasn't - * expired too late. 
- */ - if (stage == 2) { - - pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n", - stage, uc.args[2], uc.args[3]); - - pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n", - stage, uc.args[4], uc.args[5]); - - TEST_ASSERT(uc.args[2] >= uc.args[3], - "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)", - stage, uc.args[2], uc.args[3]); - - TEST_ASSERT(uc.args[4] < uc.args[5], - "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)", - stage, uc.args[4], uc.args[5]); - } - - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); - } - -done: - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c deleted file mode 100644 index 67a62a5a8895..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ /dev/null @@ -1,304 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_set_nested_state_test - * - * Copyright (C) 2019, Google LLC. - * - * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include -#include -#include -#include - -/* - * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value - * changes this should be updated. - */ -#define VMCS12_REVISION 0x11e57ed0 - -bool have_evmcs; - -void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) -{ - vcpu_nested_state_set(vcpu, state); -} - -void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state, - int expected_errno) -{ - int rv; - - rv = __vcpu_nested_state_set(vcpu, state); - TEST_ASSERT(rv == -1 && errno == expected_errno, - "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", - strerror(expected_errno), expected_errno, rv, strerror(errno), - errno); -} - -void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EINVAL); -} - -void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EFAULT); -} - -void set_revision_id_for_vmcs12(struct kvm_nested_state *state, - u32 vmcs12_revision) -{ - /* Set revision_id in vmcs12 to vmcs12_revision. 
*/ - memcpy(&state->data, &vmcs12_revision, sizeof(u32)); -} - -void set_default_state(struct kvm_nested_state *state) -{ - memset(state, 0, sizeof(*state)); - state->flags = KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_GUEST_MODE; - state->format = 0; - state->size = sizeof(*state); -} - -void set_default_vmx_state(struct kvm_nested_state *state, int size) -{ - memset(state, 0, size); - if (have_evmcs) - state->flags = KVM_STATE_NESTED_EVMCS; - state->format = 0; - state->size = size; - state->hdr.vmx.vmxon_pa = 0x1000; - state->hdr.vmx.vmcs12_pa = 0x2000; - state->hdr.vmx.smm.flags = 0; - set_revision_id_for_vmcs12(state, VMCS12_REVISION); -} - -void test_vmx_nested_state(struct kvm_vcpu *vcpu) -{ - /* Add a page for VMCS12. */ - const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); - struct kvm_nested_state *state = - (struct kvm_nested_state *)malloc(state_sz); - - /* The format must be set to 0. 0 for VMX, 1 for SVM. */ - set_default_vmx_state(state, state_sz); - state->format = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. - */ - set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa - * is set to -1ull, but the flags must be zero. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vcpu, state); - - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - - state->flags = 0; - test_nested_state(vcpu, state); - - /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* - * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without - * setting the nested state. When the eVMCS flag is not set, the - * expected return value is '0'. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state(vcpu, state); - - /* - * When eVMCS is supported, the eVMCS flag can only be set if the - * enlightened VMCS capability has been enabled. - */ - if (have_evmcs) { - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - vcpu_enable_evmcs(vcpu); - test_nested_state(vcpu, state); - } - - /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ - state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vcpu, state); - - /* Invalid flags are rejected. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa set to a non-page aligned address. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and - * KVM_STATE_NESTED_GUEST_MODE set together. 
- */ - set_default_vmx_state(state, state_sz); - state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have any of the SMM flags set besides: - * KVM_STATE_NESTED_SMM_GUEST_MODE - * KVM_STATE_NESTED_SMM_VMXON - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | - KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vcpu, state); - - /* Outside SMM, SMM flags must be zero. */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * Size must be large enough to fit kvm_nested_state and vmcs12 - * if VMCS12 physical address is set - */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vcpu, state); - - /* - * KVM_SET_NESTED_STATE succeeds with invalid VMCS - * contents but L2 not running. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - test_nested_state(vcpu, state); - - /* Invalid flags are rejected, even if no VMCS loaded. */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* vmxon_pa cannot be the same address as vmcs_pa. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 0; - state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vcpu, state); - - /* - * Test that if we leave nesting the state reflects that when we get - * it again. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = 0; - test_nested_state(vcpu, state); - vcpu_nested_state_get(vcpu, state); - TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, - "Size must be between %ld and %d. The size returned was %d.", - sizeof(*state), state_sz, state->size); - TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); - TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); - - free(state); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_nested_state state; - struct kvm_vcpu *vcpu; - - have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - - /* - * AMD currently does not implement set_nested_state, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - /* - * First run tests with VMX disabled to check error handling. - */ - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vcpu, NULL); - - /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ - set_default_state(&state); - state.size = 0; - test_nested_state_expect_einval(vcpu, &state); - - /* - * Setting the flags 0xf fails the flags check. 
The only flags that - * can be used are: - * KVM_STATE_NESTED_GUEST_MODE - * KVM_STATE_NESTED_RUN_PENDING - * KVM_STATE_NESTED_EVMCS - */ - set_default_state(&state); - state.flags = 0xf; - test_nested_state_expect_einval(vcpu, &state); - - /* - * If KVM_STATE_NESTED_RUN_PENDING is set then - * KVM_STATE_NESTED_GUEST_MODE has to be set as well. - */ - set_default_state(&state); - state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vcpu, &state); - - test_vmx_nested_state(vcpu); - - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c deleted file mode 100644 index 2ceb5c78c442..000000000000 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_tsc_adjust_test - * - * Copyright (C) 2018, Google LLC. - * - * IA32_TSC_ADJUST test - * - * According to the SDM, "if an execution of WRMSR to the - * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC, - * the logical processor also adds (or subtracts) value X from the - * IA32_TSC_ADJUST MSR. - * - * Note that when L1 doesn't intercept writes to IA32_TSC, a - * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC - * value. - * - * This test verifies that this unusual case is handled correctly. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include - -#include "kselftest.h" - -#ifndef MSR_IA32_TSC_ADJUST -#define MSR_IA32_TSC_ADJUST 0x3b -#endif - -#define TSC_ADJUST_VALUE (1ll << 32) -#define TSC_OFFSET_VALUE -(1ll << 48) - -enum { - PORT_ABORT = 0x1000, - PORT_REPORT, - PORT_DONE, -}; - -enum { - VMXON_PAGE = 0, - VMCS_PAGE, - MSR_BITMAP_PAGE, - - NUM_VMX_PAGES, -}; - -/* The virtual machine object. */ -static struct kvm_vm *vm; - -static void check_ia32_tsc_adjust(int64_t max) -{ - int64_t adjust; - - adjust = rdmsr(MSR_IA32_TSC_ADJUST); - GUEST_SYNC(adjust); - GUEST_ASSERT(adjust <= max); -} - -static void l2_guest_code(void) -{ - uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE; - - wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE); - check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - - /* Exit to L1 */ - __asm__ __volatile__("vmcall"); -} - -static void l1_guest_code(struct vmx_pages *vmx_pages) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - uintptr_t save_cr3; - - GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); - wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); - - /* Jump into L2. First, test failure to load guest CR3. 
*/ - save_cr3 = vmreadz(GUEST_CR3); - vmwrite(GUEST_CR3, -1ull); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == - (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - vmwrite(GUEST_CR3, save_cr3); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - - check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - - GUEST_DONE(); -} - -static void report(int64_t val) -{ - pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", - val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva; - struct kvm_vcpu *vcpu; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); - - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - for (;;) { - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - report(uc.args[1]); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c deleted file mode 100644 index a76078a08ff8..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ /dev/null @@ -1,487 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * xapic_ipi_test - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake - * another vCPU that is halted when KVM's backing page for the APIC access - * address has been moved by mm. - * - * The test starts two vCPUs: one that sends IPIs and one that continually - * executes HLT. The sender checks that the halter has woken from the HLT and - * has reentered HLT before sending the next IPI. While the vCPUs are running, - * the host continually calls migrate_pages to move all of the process' pages - * amongst the available numa nodes on the machine. - * - * Migration is a command line option. When used on non-numa machines will - * exit with error. Test is still usefull on non-numa for testing IPIs. - */ -#include -#include -#include -#include -#include - -#include "kvm_util.h" -#include "numaif.h" -#include "processor.h" -#include "test_util.h" -#include "vmx.h" - -/* Default running time for the test */ -#define DEFAULT_RUN_SECS 3 - -/* Default delay between migrate_pages calls (microseconds) */ -#define DEFAULT_DELAY_USECS 500000 - -/* - * Vector for IPI from sender vCPU to halting vCPU. - * Value is arbitrary and was chosen for the alternating bit pattern. Any - * value should work. - */ -#define IPI_VECTOR 0xa5 - -/* - * Incremented in the IPI handler. 
Provides evidence to the sender that the IPI - * arrived at the destination - */ -static volatile uint64_t ipis_rcvd; - -/* Data struct shared between host main thread and vCPUs */ -struct test_data_page { - uint32_t halter_apic_id; - volatile uint64_t hlt_count; - volatile uint64_t wake_count; - uint64_t ipis_sent; - uint64_t migrations_attempted; - uint64_t migrations_completed; - uint32_t icr; - uint32_t icr2; - uint32_t halter_tpr; - uint32_t halter_ppr; - - /* - * Record local version register as a cross-check that APIC access - * worked. Value should match what KVM reports (APIC_VERSION in - * arch/x86/kvm/lapic.c). If test is failing, check that values match - * to determine whether APIC access exits are working. - */ - uint32_t halter_lvr; -}; - -struct thread_params { - struct test_data_page *data; - struct kvm_vcpu *vcpu; - uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ -}; - -void verify_apic_base_addr(void) -{ - uint64_t msr = rdmsr(MSR_IA32_APICBASE); - uint64_t base = GET_APIC_BASE(msr); - - GUEST_ASSERT(base == APIC_DEFAULT_GPA); -} - -static void halter_guest_code(struct test_data_page *data) -{ - verify_apic_base_addr(); - xapic_enable(); - - data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); - data->halter_lvr = xapic_read_reg(APIC_LVR); - - /* - * Loop forever HLTing and recording halts & wakes. Disable interrupts - * each time around to minimize window between signaling the pending - * halt to the sender vCPU and executing the halt. No need to disable on - * first run as this vCPU executes first and the host waits for it to - * signal going into first halt before starting the sender vCPU. Record - * TPR and PPR for diagnostic purposes in case the test fails. - */ - for (;;) { - data->halter_tpr = xapic_read_reg(APIC_TASKPRI); - data->halter_ppr = xapic_read_reg(APIC_PROCPRI); - data->hlt_count++; - asm volatile("sti; hlt; cli"); - data->wake_count++; - } -} - -/* - * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to - * enable diagnosing errant writes to the APIC access address backing page in - * case of test failure. - */ -static void guest_ipi_handler(struct ex_regs *regs) -{ - ipis_rcvd++; - xapic_write_reg(APIC_EOI, 77); -} - -static void sender_guest_code(struct test_data_page *data) -{ - uint64_t last_wake_count; - uint64_t last_hlt_count; - uint64_t last_ipis_rcvd_count; - uint32_t icr_val; - uint32_t icr2_val; - uint64_t tsc_start; - - verify_apic_base_addr(); - xapic_enable(); - - /* - * Init interrupt command register for sending IPIs - * - * Delivery mode=fixed, per SDM: - * "Delivers the interrupt specified in the vector field to the target - * processor." - * - * Destination mode=physical i.e. specify target by its local APIC - * ID. This vCPU assumes that the halter vCPU has already started and - * set data->halter_apic_id. - */ - icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR); - icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id); - data->icr = icr_val; - data->icr2 = icr2_val; - - last_wake_count = data->wake_count; - last_hlt_count = data->hlt_count; - last_ipis_rcvd_count = ipis_rcvd; - for (;;) { - /* - * Send IPI to halter vCPU. - * First IPI can be sent unconditionally because halter vCPU - * starts earlier. - */ - xapic_write_reg(APIC_ICR2, icr2_val); - xapic_write_reg(APIC_ICR, icr_val); - data->ipis_sent++; - - /* - * Wait up to ~1 sec for halter to indicate that it has: - * 1. Received the IPI - * 2. Woken up from the halt - * 3. 
Gone back into halt - * Current CPUs typically run at 2.x Ghz which is ~2 - * billion ticks per second. - */ - tsc_start = rdtsc(); - while (rdtsc() - tsc_start < 2000000000) { - if ((ipis_rcvd != last_ipis_rcvd_count) && - (data->wake_count != last_wake_count) && - (data->hlt_count != last_hlt_count)) - break; - } - - GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) && - (data->wake_count != last_wake_count) && - (data->hlt_count != last_hlt_count)); - - last_wake_count = data->wake_count; - last_hlt_count = data->hlt_count; - last_ipis_rcvd_count = ipis_rcvd; - } -} - -static void *vcpu_thread(void *arg) -{ - struct thread_params *params = (struct thread_params *)arg; - struct kvm_vcpu *vcpu = params->vcpu; - struct ucall uc; - int old; - int r; - - r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); - TEST_ASSERT(r == 0, - "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", - vcpu->id, r); - - fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id); - vcpu_run(vcpu); - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - if (get_ucall(vcpu, &uc) == UCALL_ABORT) { - TEST_ASSERT(false, - "vCPU %u exited with error: %s.\n" - "Sending vCPU sent %lu IPIs to halting vCPU\n" - "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" - "Halter TPR=%#x PPR=%#x LVR=%#x\n" - "Migrations attempted: %lu\n" - "Migrations completed: %lu", - vcpu->id, (const char *)uc.args[0], - params->data->ipis_sent, params->data->hlt_count, - params->data->wake_count, - *params->pipis_rcvd, params->data->halter_tpr, - params->data->halter_ppr, params->data->halter_lvr, - params->data->migrations_attempted, - params->data->migrations_completed); - } - - return NULL; -} - -static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) -{ - void *retval; - int r; - - r = pthread_cancel(thread); - TEST_ASSERT(r == 0, - "pthread_cancel on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - - r = pthread_join(thread, &retval); - TEST_ASSERT(r == 0, - "pthread_join on vcpu_id=%d failed with errno=%d", - vcpu->id, r); - TEST_ASSERT(retval == PTHREAD_CANCELED, - "expected retval=%p, got %p", PTHREAD_CANCELED, - retval); -} - -void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, - uint64_t *pipis_rcvd) -{ - long pages_not_moved; - unsigned long nodemask = 0; - unsigned long nodemasks[sizeof(nodemask) * 8]; - int nodes = 0; - time_t start_time, last_update, now; - time_t interval_secs = 1; - int i, r; - int from, to; - unsigned long bit; - uint64_t hlt_count; - uint64_t wake_count; - uint64_t ipis_sent; - - fprintf(stderr, "Calling migrate_pages every %d microseconds\n", - delay_usecs); - - /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, - 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); - - fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " - "(each 1-bit indicates node is present): %#lx\n", - sizeof(nodemask) * 8, nodemask); - - /* Init array of masks containing a single-bit in each, one for each - * available node. migrate_pages called below requires specifying nodes - * as bit masks. - */ - for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) { - if (nodemask & bit) { - nodemasks[nodes] = nodemask & bit; - nodes++; - } - } - - TEST_ASSERT(nodes > 1, - "Did not find at least 2 numa nodes. 
Can't do migration"); - - fprintf(stderr, "Migrating amongst %d nodes found\n", nodes); - - from = 0; - to = 1; - start_time = time(NULL); - last_update = start_time; - - ipis_sent = data->ipis_sent; - hlt_count = data->hlt_count; - wake_count = data->wake_count; - - while ((int)(time(NULL) - start_time) < run_secs) { - data->migrations_attempted++; - - /* - * migrate_pages with PID=0 will migrate all pages of this - * process between the nodes specified as bitmasks. The page - * backing the APIC access address belongs to this process - * because it is allocated by KVM in the context of the - * KVM_CREATE_VCPU ioctl. If that assumption ever changes this - * test may break or give a false positive signal. - */ - pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]), - &nodemasks[from], - &nodemasks[to]); - if (pages_not_moved < 0) - fprintf(stderr, - "migrate_pages failed, errno=%d\n", errno); - else if (pages_not_moved > 0) - fprintf(stderr, - "migrate_pages could not move %ld pages\n", - pages_not_moved); - else - data->migrations_completed++; - - from = to; - to++; - if (to == nodes) - to = 0; - - now = time(NULL); - if (((now - start_time) % interval_secs == 0) && - (now != last_update)) { - last_update = now; - fprintf(stderr, - "%lu seconds: Migrations attempted=%lu completed=%lu, " - "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n", - now - start_time, data->migrations_attempted, - data->migrations_completed, - data->ipis_sent, *pipis_rcvd, - data->hlt_count, data->wake_count); - - TEST_ASSERT(ipis_sent != data->ipis_sent && - hlt_count != data->hlt_count && - wake_count != data->wake_count, - "IPI, HLT and wake count have not increased " - "in the last %lu seconds. " - "HLTer is likely hung.", interval_secs); - - ipis_sent = data->ipis_sent; - hlt_count = data->hlt_count; - wake_count = data->wake_count; - } - usleep(delay_usecs); - } -} - -void get_cmdline_args(int argc, char *argv[], int *run_secs, - bool *migrate, int *delay_usecs) -{ - for (;;) { - int opt = getopt(argc, argv, "s:d:m"); - - if (opt == -1) - break; - switch (opt) { - case 's': - *run_secs = parse_size(optarg); - break; - case 'm': - *migrate = true; - break; - case 'd': - *delay_usecs = parse_size(optarg); - break; - default: - TEST_ASSERT(false, - "Usage: -s . Default is %d seconds.\n" - "-m adds calls to migrate_pages while vCPUs are running." - " Default is no migrations.\n" - "-d - delay between migrate_pages() calls." 
- " Default is %d microseconds.", - DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS); - } - } -} - -int main(int argc, char *argv[]) -{ - int r; - int wait_secs; - const int max_halter_wait = 10; - int run_secs = 0; - int delay_usecs = 0; - struct test_data_page *data; - vm_vaddr_t test_data_page_vaddr; - bool migrate = false; - pthread_t threads[2]; - struct thread_params params[2]; - struct kvm_vm *vm; - uint64_t *pipis_rcvd; - - get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs); - if (run_secs <= 0) - run_secs = DEFAULT_RUN_SECS; - if (delay_usecs <= 0) - delay_usecs = DEFAULT_DELAY_USECS; - - vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); - - vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code); - - test_data_page_vaddr = vm_vaddr_alloc_page(vm); - data = addr_gva2hva(vm, test_data_page_vaddr); - memset(data, 0, sizeof(*data)); - params[0].data = data; - params[1].data = data; - - vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr); - vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr); - - pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); - params[0].pipis_rcvd = pipis_rcvd; - params[1].pipis_rcvd = pipis_rcvd; - - /* Start halter vCPU thread and wait for it to execute first HLT. */ - r = pthread_create(&threads[0], NULL, vcpu_thread, ¶ms[0]); - TEST_ASSERT(r == 0, - "pthread_create halter failed errno=%d", errno); - fprintf(stderr, "Halter vCPU thread started\n"); - - wait_secs = 0; - while ((wait_secs < max_halter_wait) && !data->hlt_count) { - sleep(1); - wait_secs++; - } - - TEST_ASSERT(data->hlt_count, - "Halter vCPU did not execute first HLT within %d seconds", - max_halter_wait); - - fprintf(stderr, - "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n", - data->halter_apic_id, wait_secs); - - r = pthread_create(&threads[1], NULL, vcpu_thread, ¶ms[1]); - TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno); - - fprintf(stderr, - "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n", - run_secs); - - if (!migrate) - sleep(run_secs); - else - do_migrations(data, run_secs, delay_usecs, pipis_rcvd); - - /* - * Cancel threads and wait for them to stop. 
- */ - cancel_join_vcpu_thread(threads[0], params[0].vcpu); - cancel_join_vcpu_thread(threads[1], params[1].vcpu); - - fprintf(stderr, - "Test successful after running for %d seconds.\n" - "Sending vCPU sent %lu IPIs to halting vCPU\n" - "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" - "Halter APIC ID=%#x\n" - "Sender ICR value=%#x ICR2 value=%#x\n" - "Halter TPR=%#x PPR=%#x LVR=%#x\n" - "Migrations attempted: %lu\n" - "Migrations completed: %lu\n", - run_secs, data->ipis_sent, - data->hlt_count, data->wake_count, *pipis_rcvd, - data->halter_apic_id, - data->icr, data->icr2, - data->halter_tpr, data->halter_ppr, data->halter_lvr, - data->migrations_attempted, data->migrations_completed); - - kvm_vm_free(vm); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c deleted file mode 100644 index 88bcca188799..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ /dev/null @@ -1,262 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include - -#include "apic.h" -#include "kvm_util.h" -#include "processor.h" -#include "test_util.h" - -struct xapic_vcpu { - struct kvm_vcpu *vcpu; - bool is_x2apic; - bool has_xavic_errata; -}; - -static void xapic_guest_code(void) -{ - asm volatile("cli"); - - xapic_enable(); - - while (1) { - uint64_t val = (u64)xapic_read_reg(APIC_IRR) | - (u64)xapic_read_reg(APIC_IRR + 0x10) << 32; - - xapic_write_reg(APIC_ICR2, val >> 32); - xapic_write_reg(APIC_ICR, val); - GUEST_SYNC(val); - } -} - -#define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ - GENMASK_ULL(17, 16) | \ - GENMASK_ULL(13, 13)) - -static void x2apic_guest_code(void) -{ - asm volatile("cli"); - - x2apic_enable(); - - do { - uint64_t val = x2apic_read_reg(APIC_IRR) | - x2apic_read_reg(APIC_IRR + 0x10) << 32; - - if (val & X2APIC_RSVD_BITS_MASK) { - x2apic_write_reg_fault(APIC_ICR, val); - } else { - x2apic_write_reg(APIC_ICR, val); - GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); - } - GUEST_SYNC(val); - } while (1); -} - -static void ____test_icr(struct xapic_vcpu *x, uint64_t val) -{ - struct kvm_vcpu *vcpu = x->vcpu; - struct kvm_lapic_state xapic; - struct ucall uc; - uint64_t icr; - - /* - * Tell the guest what ICR value to write. Use the IRR to pass info, - * all bits are valid and should not be modified by KVM (ignoring the - * fact that vectors 0-15 are technically illegal). - */ - vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); - *((u32 *)&xapic.regs[APIC_IRR]) = val; - *((u32 *)&xapic.regs[APIC_IRR + 0x10]) = val >> 32; - vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); - - vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); - TEST_ASSERT_EQ(uc.args[1], val); - - vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); - icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | - (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) { - if (!x->has_xavic_errata) - val &= (-1u | (0xffull << (32 + 24))); - } else if (val & X2APIC_RSVD_BITS_MASK) { - return; - } - - if (x->has_xavic_errata) - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); - else - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); -} - -static void __test_icr(struct xapic_vcpu *x, uint64_t val) -{ - /* - * The BUSY bit is reserved on both AMD and Intel, but only AMD treats - * it is as _must_ be zero. Intel simply ignores the bit. Don't test - * the BUSY bit for x2APIC, as there is no single correct behavior. 
- */ - if (!x->is_x2apic) - ____test_icr(x, val | APIC_ICR_BUSY); - - ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); -} - -static void test_icr(struct xapic_vcpu *x) -{ - struct kvm_vcpu *vcpu = x->vcpu; - uint64_t icr, i, j; - - icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED; - for (i = 0; i <= 0xff; i++) - __test_icr(x, icr | i); - - icr = APIC_INT_ASSERT | APIC_DM_FIXED; - for (i = 0; i <= 0xff; i++) - __test_icr(x, icr | i); - - /* - * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of - * vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff. - */ - icr = APIC_INT_ASSERT | 0xff; - for (i = 0; i < 0xff; i++) { - if (i == vcpu->id) - continue; - for (j = 0; j < 8; j++) - __test_icr(x, i << (32 + 24) | icr | (j << 8)); - } - - /* And again with a shorthand destination for all types of IPIs. */ - icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT; - for (i = 0; i < 8; i++) - __test_icr(x, icr | (i << 8)); - - /* And a few garbage value, just make sure it's an IRQ (blocked). */ - __test_icr(x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); - __test_icr(x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); - __test_icr(x, -1ull & ~APIC_DM_FIXED_MASK); -} - -static void __test_apic_id(struct kvm_vcpu *vcpu, uint64_t apic_base) -{ - uint32_t apic_id, expected; - struct kvm_lapic_state xapic; - - vcpu_set_msr(vcpu, MSR_IA32_APICBASE, apic_base); - - vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); - - expected = apic_base & X2APIC_ENABLE ? vcpu->id : vcpu->id << 24; - apic_id = *((u32 *)&xapic.regs[APIC_ID]); - - TEST_ASSERT(apic_id == expected, - "APIC_ID not set back to %s format; wanted = %x, got = %x", - (apic_base & X2APIC_ENABLE) ? "x2APIC" : "xAPIC", - expected, apic_id); -} - -/* - * Verify that KVM switches the APIC_ID between xAPIC and x2APIC when userspace - * stuffs MSR_IA32_APICBASE. Setting the APIC_ID when x2APIC is enabled and - * when the APIC transitions for DISABLED to ENABLED is architectural behavior - * (on Intel), whereas the x2APIC => xAPIC transition behavior is KVM ABI since - * attempted to transition from x2APIC to xAPIC without disabling the APIC is - * architecturally disallowed. - */ -static void test_apic_id(void) -{ - const uint32_t NR_VCPUS = 3; - struct kvm_vcpu *vcpus[NR_VCPUS]; - uint64_t apic_base; - struct kvm_vm *vm; - int i; - - vm = vm_create_with_vcpus(NR_VCPUS, NULL, vcpus); - vm_enable_cap(vm, KVM_CAP_X2APIC_API, KVM_X2APIC_API_USE_32BIT_IDS); - - for (i = 0; i < NR_VCPUS; i++) { - apic_base = vcpu_get_msr(vcpus[i], MSR_IA32_APICBASE); - - TEST_ASSERT(apic_base & MSR_IA32_APICBASE_ENABLE, - "APIC not in ENABLED state at vCPU RESET"); - TEST_ASSERT(!(apic_base & X2APIC_ENABLE), - "APIC not in xAPIC mode at vCPU RESET"); - - __test_apic_id(vcpus[i], apic_base); - __test_apic_id(vcpus[i], apic_base | X2APIC_ENABLE); - __test_apic_id(vcpus[i], apic_base); - } - - kvm_vm_free(vm); -} - -static void test_x2apic_id(void) -{ - struct kvm_lapic_state lapic = {}; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - int i; - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); - - /* - * Try stuffing a modified x2APIC ID, KVM should ignore the value and - * always return the vCPU's default/readonly x2APIC ID. 
- */ - for (i = 0; i <= 0xff; i++) { - *(u32 *)(lapic.regs + APIC_ID) = i << 24; - *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED; - vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); - - vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic); - TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24, - "x2APIC ID should be fully readonly"); - } - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - struct xapic_vcpu x = { - .vcpu = NULL, - .is_x2apic = true, - }; - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); - test_icr(&x); - kvm_vm_free(vm); - - /* - * Use a second VM for the xAPIC test so that x2APIC can be hidden from - * the guest in order to test AVIC. KVM disallows changing CPUID after - * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. - */ - vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); - x.is_x2apic = false; - - /* - * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), - * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel - * drops writes, AMD does not). Account for the errata when checking - * that KVM reads back what was written. - */ - x.has_xavic_errata = host_cpu_is_amd && - get_kvm_amd_param_bool("avic"); - - vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - test_icr(&x); - kvm_vm_free(vm); - - test_apic_id(); - test_x2apic_id(); -} diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c deleted file mode 100644 index c8a5c5e51661..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * XCR0 cpuid test - * - * Copyright (C) 2022, Google LLC. - */ -#include -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "processor.h" - -/* - * Assert that architectural dependency rules are satisfied, e.g. that AVX is - * supported if and only if SSE is supported. - */ -#define ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, xfeatures, dependencies) \ -do { \ - uint64_t __supported = (supported_xcr0) & ((xfeatures) | (dependencies)); \ - \ - __GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \ - __supported == ((xfeatures) | (dependencies)), \ - "supported = 0x%lx, xfeatures = 0x%llx, dependencies = 0x%llx", \ - __supported, (xfeatures), (dependencies)); \ -} while (0) - -/* - * Assert that KVM reports a sane, usable as-is XCR0. Architecturally, a CPU - * isn't strictly required to _support_ all XFeatures related to a feature, but - * at the same time XSETBV will #GP if bundled XFeatures aren't enabled and - * disabled coherently. E.g. a CPU can technically enumerate supported for - * XTILE_CFG but not XTILE_DATA, but attempting to enable XTILE_CFG without - * XTILE_DATA will #GP. 
- */ -#define ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, xfeatures) \ -do { \ - uint64_t __supported = (supported_xcr0) & (xfeatures); \ - \ - __GUEST_ASSERT(!__supported || __supported == (xfeatures), \ - "supported = 0x%lx, xfeatures = 0x%llx", \ - __supported, (xfeatures)); \ -} while (0) - -static void guest_code(void) -{ - uint64_t initial_xcr0; - uint64_t supported_xcr0; - int i, vector; - - set_cr4(get_cr4() | X86_CR4_OSXSAVE); - - initial_xcr0 = xgetbv(0); - supported_xcr0 = this_cpu_supported_xcr0(); - - GUEST_ASSERT(initial_xcr0 == supported_xcr0); - - /* Check AVX */ - ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, - XFEATURE_MASK_YMM, - XFEATURE_MASK_SSE); - - /* Check MPX */ - ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, - XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - /* Check AVX-512 */ - ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, - XFEATURE_MASK_AVX512, - XFEATURE_MASK_SSE | XFEATURE_MASK_YMM); - ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, - XFEATURE_MASK_AVX512); - - /* Check AMX */ - ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, - XFEATURE_MASK_XTILE); - - vector = xsetbv_safe(0, XFEATURE_MASK_FP); - __GUEST_ASSERT(!vector, - "Expected success on XSETBV(FP), got vector '0x%x'", - vector); - - vector = xsetbv_safe(0, supported_xcr0); - __GUEST_ASSERT(!vector, - "Expected success on XSETBV(0x%lx), got vector '0x%x'", - supported_xcr0, vector); - - for (i = 0; i < 64; i++) { - if (supported_xcr0 & BIT_ULL(i)) - continue; - - vector = xsetbv_safe(0, supported_xcr0 | BIT_ULL(i)); - __GUEST_ASSERT(vector == GP_VECTOR, - "Expected #GP on XSETBV(0x%llx), supported XCR0 = %lx, got vector '0x%x'", - BIT_ULL(i), supported_xcr0, vector); - } - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct kvm_vm *vm; - struct ucall uc; - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - - while (1) { - vcpu_run(vcpu); - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),", - run->exit_reason, - exit_reason_str(run->exit_reason)); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } - -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c deleted file mode 100644 index a59b3c799bb2..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ /dev/null @@ -1,1161 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright © 2021 Amazon.com, Inc. or its affiliates. 
- */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#include -#include -#include -#include -#include - -#include - -#define SHINFO_REGION_GVA 0xc0000000ULL -#define SHINFO_REGION_GPA 0xc0000000ULL -#define SHINFO_REGION_SLOT 10 - -#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (3 * PAGE_SIZE)) -#define DUMMY_REGION_SLOT 11 - -#define DUMMY_REGION_GPA_2 (SHINFO_REGION_GPA + (4 * PAGE_SIZE)) -#define DUMMY_REGION_SLOT_2 12 - -#define SHINFO_ADDR (SHINFO_REGION_GPA) -#define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) -#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) -#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15) - -#define SHINFO_VADDR (SHINFO_REGION_GVA) -#define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) -#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15) - -#define EVTCHN_VECTOR 0x10 - -#define EVTCHN_TEST1 15 -#define EVTCHN_TEST2 66 -#define EVTCHN_TIMER 13 - -enum { - TEST_INJECT_VECTOR = 0, - TEST_RUNSTATE_runnable, - TEST_RUNSTATE_blocked, - TEST_RUNSTATE_offline, - TEST_RUNSTATE_ADJUST, - TEST_RUNSTATE_DATA, - TEST_STEAL_TIME, - TEST_EVTCHN_MASKED, - TEST_EVTCHN_UNMASKED, - TEST_EVTCHN_SLOWPATH, - TEST_EVTCHN_SEND_IOCTL, - TEST_EVTCHN_HCALL, - TEST_EVTCHN_HCALL_SLOWPATH, - TEST_EVTCHN_HCALL_EVENTFD, - TEST_TIMER_SETUP, - TEST_TIMER_WAIT, - TEST_TIMER_RESTORE, - TEST_POLL_READY, - TEST_POLL_TIMEOUT, - TEST_POLL_MASKED, - TEST_POLL_WAKE, - SET_VCPU_INFO, - TEST_TIMER_PAST, - TEST_LOCKING_SEND_RACE, - TEST_LOCKING_POLL_RACE, - TEST_LOCKING_POLL_TIMEOUT, - TEST_DONE, - - TEST_GUEST_SAW_IRQ, -}; - -#define XEN_HYPERCALL_MSR 0x40000000 - -#define MIN_STEAL_TIME 50000 - -#define SHINFO_RACE_TIMEOUT 2 /* seconds */ - -#define __HYPERVISOR_set_timer_op 15 -#define __HYPERVISOR_sched_op 29 -#define __HYPERVISOR_event_channel_op 32 - -#define SCHEDOP_poll 3 - -#define EVTCHNOP_send 4 - -#define EVTCHNSTAT_interdomain 2 - -struct evtchn_send { - u32 port; -}; - -struct sched_poll { - u32 *ports; - unsigned int nr_ports; - u64 timeout; -}; - -struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; -} __attribute__((__packed__)); /* 32 bytes */ - -struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; -} __attribute__((__packed__)); - -struct vcpu_runstate_info { - uint32_t state; - uint64_t state_entry_time; - uint64_t time[5]; /* Extra field for overrun check */ -}; - -struct compat_vcpu_runstate_info { - uint32_t state; - uint64_t state_entry_time; - uint64_t time[5]; -} __attribute__((__packed__)); - -struct arch_vcpu_info { - unsigned long cr2; - unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ -}; - -struct vcpu_info { - uint8_t evtchn_upcall_pending; - uint8_t evtchn_upcall_mask; - unsigned long evtchn_pending_sel; - struct arch_vcpu_info arch; - struct pvclock_vcpu_time_info time; -}; /* 64 bytes (x86) */ - -struct shared_info { - struct vcpu_info vcpu_info[32]; - unsigned long evtchn_pending[64]; - unsigned long evtchn_mask[64]; - struct pvclock_wall_clock wc; - uint32_t wc_sec_hi; - /* arch_shared_info here */ -}; - -#define RUNSTATE_running 0 -#define RUNSTATE_runnable 1 -#define RUNSTATE_blocked 2 -#define RUNSTATE_offline 3 - -static const char *runstate_names[] = { - "running", - "runnable", - "blocked", - "offline" -}; - -struct { - struct kvm_irq_routing info; - struct kvm_irq_routing_entry entries[2]; -} irq_routes; - -static volatile bool guest_saw_irq; - -static void 
evtchn_handler(struct ex_regs *regs) -{ - struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; - - vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); - vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); - guest_saw_irq = true; - - GUEST_SYNC(TEST_GUEST_SAW_IRQ); -} - -static void guest_wait_for_irq(void) -{ - while (!guest_saw_irq) - __asm__ __volatile__ ("rep nop" : : : "memory"); - guest_saw_irq = false; -} - -static void guest_code(void) -{ - struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; - int i; - - __asm__ __volatile__( - "sti\n" - "nop\n" - ); - - /* Trigger an interrupt injection */ - GUEST_SYNC(TEST_INJECT_VECTOR); - - guest_wait_for_irq(); - - /* Test having the host set runstates manually */ - GUEST_SYNC(TEST_RUNSTATE_runnable); - GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); - GUEST_ASSERT(rs->state == 0); - - GUEST_SYNC(TEST_RUNSTATE_blocked); - GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0); - GUEST_ASSERT(rs->state == 0); - - GUEST_SYNC(TEST_RUNSTATE_offline); - GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0); - GUEST_ASSERT(rs->state == 0); - - /* Test runstate time adjust */ - GUEST_SYNC(TEST_RUNSTATE_ADJUST); - GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a); - GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b); - - /* Test runstate time set */ - GUEST_SYNC(TEST_RUNSTATE_DATA); - GUEST_ASSERT(rs->state_entry_time >= 0x8000); - GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0); - GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b); - GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a); - - /* sched_yield() should result in some 'runnable' time */ - GUEST_SYNC(TEST_STEAL_TIME); - GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); - - /* Attempt to deliver a *masked* interrupt */ - GUEST_SYNC(TEST_EVTCHN_MASKED); - - /* Wait until we see the bit set */ - struct shared_info *si = (void *)SHINFO_VADDR; - while (!si->evtchn_pending[0]) - __asm__ __volatile__ ("rep nop" : : : "memory"); - - /* Now deliver an *unmasked* interrupt */ - GUEST_SYNC(TEST_EVTCHN_UNMASKED); - - guest_wait_for_irq(); - - /* Change memslots and deliver an interrupt */ - GUEST_SYNC(TEST_EVTCHN_SLOWPATH); - - guest_wait_for_irq(); - - /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */ - GUEST_SYNC(TEST_EVTCHN_SEND_IOCTL); - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_EVTCHN_HCALL); - - /* Our turn. Deliver event channel (to ourselves) with - * EVTCHNOP_send hypercall. */ - struct evtchn_send s = { .port = 127 }; - xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_EVTCHN_HCALL_SLOWPATH); - - /* - * Same again, but this time the host has messed with memslots so it - * should take the slow path in kvm_xen_set_evtchn(). - */ - xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_EVTCHN_HCALL_EVENTFD); - - /* Deliver "outbound" event channel to an eventfd which - * happens to be one of our own irqfds. */ - s.port = 197; - xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_TIMER_SETUP); - - /* Set a timer 100ms in the future. */ - xen_hypercall(__HYPERVISOR_set_timer_op, - rs->state_entry_time + 100000000, NULL); - - GUEST_SYNC(TEST_TIMER_WAIT); - - /* Now wait for the timer */ - guest_wait_for_irq(); - - GUEST_SYNC(TEST_TIMER_RESTORE); - - /* The host has 'restored' the timer. Just wait for it. 
*/ - guest_wait_for_irq(); - - GUEST_SYNC(TEST_POLL_READY); - - /* Poll for an event channel port which is already set */ - u32 ports[1] = { EVTCHN_TIMER }; - struct sched_poll p = { - .ports = ports, - .nr_ports = 1, - .timeout = 0, - }; - - xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); - - GUEST_SYNC(TEST_POLL_TIMEOUT); - - /* Poll for an unset port and wait for the timeout. */ - p.timeout = 100000000; - xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); - - GUEST_SYNC(TEST_POLL_MASKED); - - /* A timer will wake the masked port we're waiting on, while we poll */ - p.timeout = 0; - xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); - - GUEST_SYNC(TEST_POLL_WAKE); - - /* Set the vcpu_info to point at exactly the place it already is to - * make sure the attribute is functional. */ - GUEST_SYNC(SET_VCPU_INFO); - - /* A timer wake an *unmasked* port which should wake us with an - * actual interrupt, while we're polling on a different port. */ - ports[0]++; - p.timeout = 0; - xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_TIMER_PAST); - - /* Timer should have fired already */ - guest_wait_for_irq(); - - GUEST_SYNC(TEST_LOCKING_SEND_RACE); - /* Racing host ioctls */ - - guest_wait_for_irq(); - - GUEST_SYNC(TEST_LOCKING_POLL_RACE); - /* Racing vmcall against host ioctl */ - - ports[0] = 0; - - p = (struct sched_poll) { - .ports = ports, - .nr_ports = 1, - .timeout = 0 - }; - -wait_for_timer: - /* - * Poll for a timer wake event while the worker thread is mucking with - * the shared info. KVM XEN drops timer IRQs if the shared info is - * invalid when the timer expires. Arbitrarily poll 100 times before - * giving up and asking the VMM to re-arm the timer. 100 polls should - * consume enough time to beat on KVM without taking too long if the - * timer IRQ is dropped due to an invalid event channel. - */ - for (i = 0; i < 100 && !guest_saw_irq; i++) - __xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); - - /* - * Re-send the timer IRQ if it was (likely) dropped due to the timer - * expiring while the event channel was invalid. 
- */ - if (!guest_saw_irq) { - GUEST_SYNC(TEST_LOCKING_POLL_TIMEOUT); - goto wait_for_timer; - } - guest_saw_irq = false; - - GUEST_SYNC(TEST_DONE); -} - -static struct shared_info *shinfo; -static struct vcpu_info *vinfo; -static struct kvm_vcpu *vcpu; - -static void handle_alrm(int sig) -{ - if (vinfo) - printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); - vcpu_dump(stdout, vcpu, 0); - TEST_FAIL("IRQ delivery timed out"); -} - -static void *juggle_shinfo_state(void *arg) -{ - struct kvm_vm *vm = (struct kvm_vm *)arg; - - struct kvm_xen_hvm_attr cache_activate_gfn = { - .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, - .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE - }; - - struct kvm_xen_hvm_attr cache_deactivate_gfn = { - .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, - .u.shared_info.gfn = KVM_XEN_INVALID_GFN - }; - - struct kvm_xen_hvm_attr cache_activate_hva = { - .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA, - .u.shared_info.hva = (unsigned long)shinfo - }; - - struct kvm_xen_hvm_attr cache_deactivate_hva = { - .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, - .u.shared_info.hva = 0 - }; - - int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); - - for (;;) { - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn); - pthread_testcancel(); - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn); - - if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) { - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva); - pthread_testcancel(); - __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva); - } - } - - return NULL; -} - -int main(int argc, char *argv[]) -{ - struct kvm_xen_hvm_attr evt_reset; - struct kvm_vm *vm; - pthread_t thread; - bool verbose; - int ret; - - verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || - !strncmp(argv[1], "--verbose", 10)); - - int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); - TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); - - bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); - bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); - bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); - bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); - bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - - /* Map a region for the shared_info page */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0); - virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3); - - shinfo = addr_gpa2hva(vm, SHINFO_VADDR); - - int zero_fd = open("/dev/zero", O_RDONLY); - TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero"); - - struct kvm_xen_hvm_config hvmc = { - .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, - .msr = XEN_HYPERCALL_MSR, - }; - - /* Let the kernel know that we *will* use it for sending all - * event channels, which lets it intercept SCHEDOP_poll */ - if (do_evtchn_tests) - hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; - - vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); - - struct kvm_xen_hvm_attr lm = { - .type = KVM_XEN_ATTR_TYPE_LONG_MODE, - .u.long_mode = 1, - }; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); - - if (do_runstate_flag) { - struct kvm_xen_hvm_attr ruf = { - .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG, - .u.runstate_update_flag = 1, - }; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf); - - ruf.u.runstate_update_flag = 0; - vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf); - TEST_ASSERT(ruf.u.runstate_update_flag == 1, - "Failed to read 
back RUNSTATE_UPDATE_FLAG attr"); - } - - struct kvm_xen_hvm_attr ha = {}; - - if (has_shinfo_hva) { - ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA; - ha.u.shared_info.hva = (unsigned long)shinfo; - } else { - ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO; - ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE; - } - - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); - - /* - * Test what happens when the HVA of the shinfo page is remapped after - * the kernel has a reference to it. But make sure we copy the clock - * info over since that's only set at setup time, and we test it later. - */ - struct pvclock_wall_clock wc_copy = shinfo->wc; - void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0); - TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info"); - shinfo->wc = wc_copy; - - struct kvm_xen_vcpu_attr vi = { - .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, - .u.gpa = VCPU_INFO_ADDR, - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi); - - struct kvm_xen_vcpu_attr pvclock = { - .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, - .u.gpa = PVTIME_ADDR, - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock); - - struct kvm_xen_hvm_attr vec = { - .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, - .u.vector = EVTCHN_VECTOR, - }; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); - - vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); - - if (do_runstate_tests) { - struct kvm_xen_vcpu_attr st = { - .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, - .u.gpa = RUNSTATE_ADDR, - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); - } - - int irq_fd[2] = { -1, -1 }; - - if (do_eventfd_tests) { - irq_fd[0] = eventfd(0, 0); - irq_fd[1] = eventfd(0, 0); - - /* Unexpected, but not a KVM failure */ - if (irq_fd[0] == -1 || irq_fd[1] == -1) - do_evtchn_tests = do_eventfd_tests = false; - } - - if (do_eventfd_tests) { - irq_routes.info.nr = 2; - - irq_routes.entries[0].gsi = 32; - irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; - irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1; - irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id; - irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; - - irq_routes.entries[1].gsi = 33; - irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; - irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2; - irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id; - irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; - - vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); - - struct kvm_irqfd ifd = { }; - - ifd.fd = irq_fd[0]; - ifd.gsi = 32; - vm_ioctl(vm, KVM_IRQFD, &ifd); - - ifd.fd = irq_fd[1]; - ifd.gsi = 33; - vm_ioctl(vm, KVM_IRQFD, &ifd); - - struct sigaction sa = { }; - sa.sa_handler = handle_alrm; - sigaction(SIGALRM, &sa, NULL); - } - - struct kvm_xen_vcpu_attr tmr = { - .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, - .u.timer.port = EVTCHN_TIMER, - .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, - .u.timer.expires_ns = 0 - }; - - if (do_evtchn_tests) { - struct kvm_xen_hvm_attr inj = { - .type = KVM_XEN_ATTR_TYPE_EVTCHN, - .u.evtchn.send_port = 127, - .u.evtchn.type = EVTCHNSTAT_interdomain, - .u.evtchn.flags = 0, - .u.evtchn.deliver.port.port = EVTCHN_TEST1, - .u.evtchn.deliver.port.vcpu = vcpu->id + 1, - .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, - }; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); - - /* Test migration to a different vCPU */ - inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE; - inj.u.evtchn.deliver.port.vcpu = vcpu->id; - 
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); - - inj.u.evtchn.send_port = 197; - inj.u.evtchn.deliver.eventfd.port = 0; - inj.u.evtchn.deliver.eventfd.fd = irq_fd[1]; - inj.u.evtchn.flags = 0; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); - - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - } - vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); - vinfo->evtchn_upcall_pending = 0; - - struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); - rs->state = 0x5a; - - bool evtchn_irq_expected = false; - - for (;;) { - struct ucall uc; - - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: { - struct kvm_xen_vcpu_attr rst; - long rundelay; - - if (do_runstate_tests) - TEST_ASSERT(rs->state_entry_time == rs->time[0] + - rs->time[1] + rs->time[2] + rs->time[3], - "runstate times don't add up"); - - switch (uc.args[1]) { - case TEST_INJECT_VECTOR: - if (verbose) - printf("Delivering evtchn upcall\n"); - evtchn_irq_expected = true; - vinfo->evtchn_upcall_pending = 1; - break; - - case TEST_RUNSTATE_runnable...TEST_RUNSTATE_offline: - TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen"); - if (!do_runstate_tests) - goto done; - if (verbose) - printf("Testing runstate %s\n", runstate_names[uc.args[1]]); - rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; - rst.u.runstate.state = uc.args[1] + RUNSTATE_runnable - - TEST_RUNSTATE_runnable; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); - break; - - case TEST_RUNSTATE_ADJUST: - if (verbose) - printf("Testing RUNSTATE_ADJUST\n"); - rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST; - memset(&rst.u, 0, sizeof(rst.u)); - rst.u.runstate.state = (uint64_t)-1; - rst.u.runstate.time_blocked = - 0x5a - rs->time[RUNSTATE_blocked]; - rst.u.runstate.time_offline = - 0x6b6b - rs->time[RUNSTATE_offline]; - rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - - rst.u.runstate.time_offline; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); - break; - - case TEST_RUNSTATE_DATA: - if (verbose) - printf("Testing RUNSTATE_DATA\n"); - rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA; - memset(&rst.u, 0, sizeof(rst.u)); - rst.u.runstate.state = RUNSTATE_running; - rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; - rst.u.runstate.time_blocked = 0x6b6b; - rst.u.runstate.time_offline = 0x5a; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); - break; - - case TEST_STEAL_TIME: - if (verbose) - printf("Testing steal time\n"); - /* Yield until scheduler delay exceeds target */ - rundelay = get_run_delay() + MIN_STEAL_TIME; - do { - sched_yield(); - } while (get_run_delay() < rundelay); - break; - - case TEST_EVTCHN_MASKED: - if (!do_eventfd_tests) - goto done; - if (verbose) - printf("Testing masked event channel\n"); - shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1; - eventfd_write(irq_fd[0], 1UL); - alarm(1); - break; - - case TEST_EVTCHN_UNMASKED: - if (verbose) - printf("Testing unmasked event channel\n"); - /* Unmask that, but deliver the other one */ - shinfo->evtchn_pending[0] = 0; - shinfo->evtchn_mask[0] = 0; - eventfd_write(irq_fd[1], 1UL); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_EVTCHN_SLOWPATH: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[1] = 0; - if (verbose) - printf("Testing event channel after memslot change\n"); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0); - 
eventfd_write(irq_fd[0], 1UL); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_EVTCHN_SEND_IOCTL: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - if (!do_evtchn_tests) - goto done; - - shinfo->evtchn_pending[0] = 0; - if (verbose) - printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n"); - - struct kvm_irq_routing_xen_evtchn e; - e.port = EVTCHN_TEST2; - e.vcpu = vcpu->id; - e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; - - vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_EVTCHN_HCALL: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[1] = 0; - - if (verbose) - printf("Testing guest EVTCHNOP_send direct to evtchn\n"); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_EVTCHN_HCALL_SLOWPATH: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[0] = 0; - - if (verbose) - printf("Testing guest EVTCHNOP_send direct to evtchn after memslot change\n"); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - DUMMY_REGION_GPA_2, DUMMY_REGION_SLOT_2, 1, 0); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_EVTCHN_HCALL_EVENTFD: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[0] = 0; - - if (verbose) - printf("Testing guest EVTCHNOP_send to eventfd\n"); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_TIMER_SETUP: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[1] = 0; - - if (verbose) - printf("Testing guest oneshot timer\n"); - break; - - case TEST_TIMER_WAIT: - memset(&tmr, 0, sizeof(tmr)); - tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); - TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, - "Timer port not returned"); - TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, - "Timer priority not returned"); - TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time, - "Timer expiry not returned"); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_TIMER_RESTORE: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - shinfo->evtchn_pending[0] = 0; - - if (verbose) - printf("Testing restored oneshot timer\n"); - - tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - evtchn_irq_expected = true; - alarm(1); - break; - - case TEST_POLL_READY: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - - if (verbose) - printf("Testing SCHEDOP_poll with already pending event\n"); - shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER; - alarm(1); - break; - - case TEST_POLL_TIMEOUT: - if (verbose) - printf("Testing SCHEDOP_poll timeout\n"); - shinfo->evtchn_pending[0] = 0; - alarm(1); - break; - - case TEST_POLL_MASKED: - if (verbose) - printf("Testing SCHEDOP_poll wake on masked event\n"); - - tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - alarm(1); - break; - - case TEST_POLL_WAKE: - shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0; - if (verbose) - printf("Testing SCHEDOP_poll wake on unmasked event\n"); - - evtchn_irq_expected = true; - tmr.u.timer.expires_ns = rs->state_entry_time + 
100000000; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - - /* Read it back and check the pending time is reported correctly */ - tmr.u.timer.expires_ns = 0; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); - TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, - "Timer not reported pending"); - alarm(1); - break; - - case SET_VCPU_INFO: - if (has_shinfo_hva) { - struct kvm_xen_vcpu_attr vih = { - .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA, - .u.hva = (unsigned long)vinfo - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih); - } - break; - - case TEST_TIMER_PAST: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - /* Read timer and check it is no longer pending */ - vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); - TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); - - shinfo->evtchn_pending[0] = 0; - if (verbose) - printf("Testing timer in the past\n"); - - evtchn_irq_expected = true; - tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - alarm(1); - break; - - case TEST_LOCKING_SEND_RACE: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - alarm(0); - - if (verbose) - printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n"); - - ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm); - TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret)); - - struct kvm_irq_routing_xen_evtchn uxe = { - .port = 1, - .vcpu = vcpu->id, - .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL - }; - - evtchn_irq_expected = true; - for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;) - __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe); - break; - - case TEST_LOCKING_POLL_RACE: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - - if (verbose) - printf("Testing shinfo lock corruption (SCHEDOP_poll)\n"); - - shinfo->evtchn_pending[0] = 1; - - evtchn_irq_expected = true; - tmr.u.timer.expires_ns = rs->state_entry_time + - SHINFO_RACE_TIMEOUT * 1000000000ULL; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - break; - - case TEST_LOCKING_POLL_TIMEOUT: - /* - * Optional and possibly repeated sync point. - * Injecting the timer IRQ may fail if the - * shinfo is invalid when the timer expires. - * If the timer has expired but the IRQ hasn't - * been delivered, rearm the timer and retry. - */ - vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); - - /* Resume the guest if the timer is still pending. */ - if (tmr.u.timer.expires_ns) - break; - - /* All done if the IRQ was delivered. 
*/ - if (!evtchn_irq_expected) - break; - - tmr.u.timer.expires_ns = rs->state_entry_time + - SHINFO_RACE_TIMEOUT * 1000000000ULL; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); - break; - case TEST_DONE: - TEST_ASSERT(!evtchn_irq_expected, - "Expected event channel IRQ but it didn't happen"); - - ret = pthread_cancel(thread); - TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret)); - - ret = pthread_join(thread, 0); - TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret)); - goto done; - - case TEST_GUEST_SAW_IRQ: - TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); - evtchn_irq_expected = false; - break; - } - break; - } - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } - } - - done: - evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN; - evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); - - alarm(0); - - /* - * Just a *really* basic check that things are being put in the - * right place. The actual calculations are much the same for - * Xen as they are for the KVM variants, so no need to check. - */ - struct pvclock_wall_clock *wc; - struct pvclock_vcpu_time_info *ti, *ti2; - struct kvm_clock_data kcdata; - long long delta; - - wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); - ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); - ti2 = addr_gpa2hva(vm, PVTIME_ADDR); - - if (verbose) { - printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec); - printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", - ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul, - ti->tsc_shift, ti->flags); - printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", - ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul, - ti2->tsc_shift, ti2->flags); - } - - TEST_ASSERT(wc->version && !(wc->version & 1), - "Bad wallclock version %x", wc->version); - - vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); - - if (kcdata.flags & KVM_CLOCK_REALTIME) { - if (verbose) { - printf("KVM_GET_CLOCK clock: %lld.%09lld\n", - kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); - printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", - kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); - } - - delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); - - /* - * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but - * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s - * delta as testing clock accuracy is not the goal here. The test just needs to - * check that the value in shinfo is somewhat sane. - */ - TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, - "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", - wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, - (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); - } else { - pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); - } - - TEST_ASSERT(ti->version && !(ti->version & 1), - "Bad time_info version %x", ti->version); - TEST_ASSERT(ti2->version && !(ti2->version & 1), - "Bad time_info version %x", ti->version); - - if (do_runstate_tests) { - /* - * Fetch runstate and check sanity. Strictly speaking in the - * general case we might not expect the numbers to be identical - * but in this case we know we aren't running the vCPU any more. 
- */ - struct kvm_xen_vcpu_attr rst = { - .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst); - - if (verbose) { - printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", - rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown", - rs->state, rs->state_entry_time); - for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) { - printf("State %s: %" PRIu64 " ns\n", - runstate_names[i], rs->time[i]); - } - } - - /* - * Exercise runstate info at all points across the page boundary, in - * 32-bit and 64-bit mode. In particular, test the case where it is - * configured in 32-bit mode and then switched to 64-bit mode while - * active, which takes it onto the second page. - */ - unsigned long runstate_addr; - struct compat_vcpu_runstate_info *crs; - for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4; - runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) { - - rs = addr_gpa2hva(vm, runstate_addr); - crs = (void *)rs; - - memset(rs, 0xa5, sizeof(*rs)); - - /* Set to compatibility mode */ - lm.u.long_mode = 0; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); - - /* Set runstate to new address (kernel will write it) */ - struct kvm_xen_vcpu_attr st = { - .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, - .u.gpa = runstate_addr, - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); - - if (verbose) - printf("Compatibility runstate at %08lx\n", runstate_addr); - - TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch"); - TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time, - "State entry time mismatch"); - TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running, - "Running time mismatch"); - TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, - "Runnable time mismatch"); - TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, - "Blocked time mismatch"); - TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, - "Offline time mismatch"); - TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, - "Structure overrun"); - TEST_ASSERT(crs->state_entry_time == crs->time[0] + - crs->time[1] + crs->time[2] + crs->time[3], - "runstate times don't add up"); - - - /* Now switch to 64-bit mode */ - lm.u.long_mode = 1; - vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); - - memset(rs, 0xa5, sizeof(*rs)); - - /* Don't change the address, just trigger a write */ - struct kvm_xen_vcpu_attr adj = { - .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST, - .u.runstate.state = (uint64_t)-1 - }; - vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj); - - if (verbose) - printf("64-bit runstate at %08lx\n", runstate_addr); - - TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); - TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, - "State entry time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, - "Running time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, - "Runnable time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, - "Blocked time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, - "Offline time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, - "Structure overrun"); - - TEST_ASSERT(rs->state_entry_time == rs->time[0] + - rs->time[1] + rs->time[2] + rs->time[3], - "runstate times don't add up"); - } - } - 
- kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c deleted file mode 100644 index 2585087cdf5c..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * xen_vmcall_test - * - * Copyright © 2020 Amazon.com, Inc. or its affiliates. - * - * Userspace hypercall testing - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "hyperv.h" - -#define HCALL_REGION_GPA 0xc0000000ULL -#define HCALL_REGION_SLOT 10 - -#define INPUTVALUE 17 -#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x) -#define RETVALUE 0xcafef00dfbfbffffUL - -#define XEN_HYPERCALL_MSR 0x40000200 -#define HV_GUEST_OS_ID_MSR 0x40000000 -#define HV_HYPERCALL_MSR 0x40000001 - -#define HVCALL_SIGNAL_EVENT 0x005d -#define HV_STATUS_INVALID_ALIGNMENT 4 - -static void guest_code(void) -{ - unsigned long rax = INPUTVALUE; - unsigned long rdi = ARGVALUE(1); - unsigned long rsi = ARGVALUE(2); - unsigned long rdx = ARGVALUE(3); - unsigned long rcx; - register unsigned long r10 __asm__("r10") = ARGVALUE(4); - register unsigned long r8 __asm__("r8") = ARGVALUE(5); - register unsigned long r9 __asm__("r9") = ARGVALUE(6); - - /* First a direct invocation of 'vmcall' */ - __asm__ __volatile__("vmcall" : - "=a"(rax) : - "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), - "r"(r10), "r"(r8), "r"(r9)); - GUEST_ASSERT(rax == RETVALUE); - - /* Fill in the Xen hypercall page */ - __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR), - "a" (HCALL_REGION_GPA & 0xffffffff), - "d" (HCALL_REGION_GPA >> 32)); - - /* Set Hyper-V Guest OS ID */ - __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR), - "a" (0x5a), "d" (0)); - - /* Hyper-V hypercall page */ - u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1; - __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR), - "a" (msrval & 0xffffffff), - "d" (msrval >> 32)); - - /* Invoke a Xen hypercall */ - __asm__ __volatile__("call *%1" : "=a"(rax) : - "r"(HCALL_REGION_GPA + INPUTVALUE * 32), - "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), - "r"(r10), "r"(r8), "r"(r9)); - GUEST_ASSERT(rax == RETVALUE); - - /* Invoke a Hyper-V hypercall */ - rax = 0; - rcx = HVCALL_SIGNAL_EVENT; /* code */ - rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */ - __asm__ __volatile__("call *%1" : "=a"(rax) : - "r"(HCALL_REGION_GPA + PAGE_SIZE), - "a"(rax), "c"(rcx), "d"(rdx), - "r"(r8)); - GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT); - - GUEST_DONE(); -} - -int main(int argc, char *argv[]) -{ - unsigned int xen_caps; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); - TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); - - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_hv_cpuid(vcpu); - - struct kvm_xen_hvm_config hvmc = { - .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, - .msr = XEN_HYPERCALL_MSR, - }; - vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); - - /* Map a region for the hypercall pages */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); - virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); - - for (;;) { - volatile struct kvm_run *run = vcpu->run; - struct ucall uc; - - vcpu_run(vcpu); - - if (run->exit_reason == KVM_EXIT_XEN) { - TEST_ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); - TEST_ASSERT_EQ(run->xen.u.hcall.cpl, 0); - TEST_ASSERT_EQ(run->xen.u.hcall.longmode, 1); - 
TEST_ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE); - TEST_ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1)); - TEST_ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2)); - TEST_ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3)); - TEST_ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4)); - TEST_ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5)); - TEST_ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6)); - run->xen.u.hcall.result = RETVALUE; - continue; - } - - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); - } - } -done: - kvm_vm_free(vm); - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c deleted file mode 100644 index f331a4e9bae3..000000000000 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2019, Google LLC. - * - * Tests for the IA32_XSS MSR. - */ -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "vmx.h" - -#define MSR_BITS 64 - -int main(int argc, char *argv[]) -{ - bool xss_in_msr_list; - struct kvm_vm *vm; - struct kvm_vcpu *vcpu; - uint64_t xss_val; - int i, r; - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVES)); - - xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); - TEST_ASSERT(xss_val == 0, - "MSR_IA32_XSS should be initialized to zero"); - - vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val); - - /* - * At present, KVM only supports a guest IA32_XSS value of 0. Verify - * that trying to set the guest IA32_XSS to an unsupported value fails. - * Also, in the future when a non-zero value succeeds check that - * IA32_XSS is in the list of MSRs to save/restore. - */ - xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); - for (i = 0; i < MSR_BITS; ++i) { - r = _vcpu_set_msr(vcpu, MSR_IA32_XSS, 1ull << i); - - /* - * Setting a list of MSRs returns the entry that "faulted", or - * the last entry +1 if all MSRs were successfully written. - */ - TEST_ASSERT(!r || r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); - TEST_ASSERT(r != 1 || xss_in_msr_list, - "IA32_XSS was able to be set, but was not in save/restore list"); - } - - kvm_vm_free(vm); -} -- cgit v1.2.3 From 9af04539d474dda4984ff4909d4568e6123c8cba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:47 -0800 Subject: KVM: selftests: Override ARCH for x86_64 instead of using ARCH_DIR Now that KVM selftests uses the kernel's canonical arch paths, directly override ARCH to 'x86' when targeting x86_64 instead of defining ARCH_DIR to redirect to appropriate paths. ARCH_DIR was originally added to deal with KVM selftests using the target triple ARCH for directories, e.g. s390x and aarch64; keeping it around just to deal with the one-off alias from x86_64=>x86 is unnecessary and confusing. Note, even when selftests are built from the top-level Makefile, ARCH is scoped to KVM's makefiles, i.e. overriding ARCH won't trip up some other selftests that (somehow) expects x86_64 and can't work with x86. 
Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20241128005547.4077116-17-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 4 +--- tools/testing/selftests/kvm/Makefile.kvm | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 9bc2eba1af1c..20af35a91d6f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,9 +6,7 @@ ARCH ?= $(SUBARCH) ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64)) # Top-level selftests allows ARCH=x86_64 :-( ifeq ($(ARCH),x86_64) - ARCH_DIR := x86 -else - ARCH_DIR := $(ARCH) + ARCH := x86 endif include Makefile.kvm else diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 9888dd6bb483..4277b983cace 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -207,10 +207,10 @@ TEST_GEN_PROGS_riscv += steal_time SPLIT_TESTS += arch_timer SPLIT_TESTS += get-reg-list -TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR)) -TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR)) -TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR)) -LIBKVM += $(LIBKVM_$(ARCH_DIR)) +TEST_PROGS += $(TEST_PROGS_$(ARCH)) +TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH)) +TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH)) +LIBKVM += $(LIBKVM_$(ARCH)) OVERRIDE_TARGETS = 1 @@ -222,14 +222,14 @@ include ../lib.mk INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH_DIR)/include +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -fno-strict-aliasing \ -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \ - -I$(LINUX_HDR_PATH) -Iinclude -I$( Date: Wed, 27 Nov 2024 17:33:33 -0800 Subject: KVM: selftests: Update x86's set_sregs_test to match KVM's CPUID enforcement Rework x86's set sregs test to verify that KVM enforces CPUID vs. CR4 features even if userspace hasn't explicitly set guest CPUID. KVM used to allow userspace to set any KVM-supported CR4 value prior to KVM_SET_CPUID2, and the test verified that behavior. However, the testcase was written purely to verify KVM's existing behavior, i.e. was NOT written to match the needs of real world VMMs. Opportunistically verify that KVM continues to reject unsupported features after KVM_SET_CPUID2 (using KVM_GET_SUPPORTED_CPUID). 
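For illustration only (not part of the patch), the essence of each TEST_INVALID_CR_BIT() invocation in the diff below is a minimal sketch along these lines, reusing the selftest helpers visible in the diff and assuming the chosen CR4 bit (LA57 here) is not exposed in the guest's CPUID; the real macro additionally verifies that KVM leaves sregs untouched on failure:

    static void check_unsupported_cr4_bit(struct kvm_vcpu *vcpu, uint64_t cr4_bit)
    {
            struct kvm_sregs sregs;
            int rc;

            /* Attempt to set a CR4 bit whose CPUID feature the guest lacks. */
            vcpu_sregs_get(vcpu, &sregs);
            sregs.cr4 |= cr4_bit;
            rc = _vcpu_sregs_set(vcpu, &sregs);

            /* KVM_SET_SREGS must fail, e.g. for X86_CR4_LA57 without LA57. */
            TEST_ASSERT(rc, "KVM should reject unsupported CR4 bit 0x%lx", cr4_bit);
    }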
Reviewed-by: Maxim Levitsky Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20241128013424.4096668-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/set_sregs_test.c | 53 ++++++++++++++---------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/set_sregs_test.c b/tools/testing/selftests/kvm/x86/set_sregs_test.c index c021c0795a96..96fd690d479a 100644 --- a/tools/testing/selftests/kvm/x86/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86/set_sregs_test.c @@ -41,13 +41,15 @@ do { \ TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \ } while (0) +#define KVM_ALWAYS_ALLOWED_CR4 (X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | \ + X86_CR4_DE | X86_CR4_PSE | X86_CR4_PAE | \ + X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE | \ + X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT) + static uint64_t calc_supported_cr4_feature_bits(void) { - uint64_t cr4; + uint64_t cr4 = KVM_ALWAYS_ALLOWED_CR4; - cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | - X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | - X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; if (kvm_cpu_has(X86_FEATURE_UMIP)) cr4 |= X86_CR4_UMIP; if (kvm_cpu_has(X86_FEATURE_LA57)) @@ -72,28 +74,14 @@ static uint64_t calc_supported_cr4_feature_bits(void) return cr4; } -int main(int argc, char *argv[]) +static void test_cr_bits(struct kvm_vcpu *vcpu, uint64_t cr4) { struct kvm_sregs sregs; - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - uint64_t cr4; int rc, i; - /* - * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and - * use it to verify all supported CR4 bits can be set prior to defining - * the vCPU model, i.e. without doing KVM_SET_CPUID2. - */ - vm = vm_create_barebones(); - vcpu = __vm_vcpu_add(vm, 0); - vcpu_sregs_get(vcpu, &sregs); - - sregs.cr0 = 0; - sregs.cr4 |= calc_supported_cr4_feature_bits(); - cr4 = sregs.cr4; - + sregs.cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + sregs.cr4 |= cr4; rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); @@ -101,7 +89,6 @@ int main(int argc, char *argv[]) TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", sregs.cr4, cr4); - /* Verify all unsupported features are rejected by KVM. */ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP); TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57); TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE); @@ -119,10 +106,28 @@ int main(int argc, char *argv[]) /* NW without CD is illegal, as is PG without PE. */ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW); TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG); +} +int main(int argc, char *argv[]) +{ + struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + /* + * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and + * use it to verify KVM enforces guest CPUID even if *userspace* never + * sets CPUID. + */ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + test_cr_bits(vcpu, KVM_ALWAYS_ALLOWED_CR4); kvm_vm_free(vm); - /* Create a "real" VM and verify APIC_BASE can be set. */ + /* Create a "real" VM with a fully populated guest CPUID and verify + * APIC_BASE and all supported CR4 can be set. 
+ */ vm = vm_create_with_one_vcpu(&vcpu, NULL); vcpu_sregs_get(vcpu, &sregs); @@ -135,6 +140,8 @@ int main(int argc, char *argv[]) TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", sregs.apic_base); + test_cr_bits(vcpu, calc_supported_cr4_feature_bits()); + kvm_vm_free(vm); return 0; -- cgit v1.2.3 From 08833719e77041e331f6193878f1b944744b9068 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:34 -0800 Subject: KVM: selftests: Assert that vcpu->cpuid is non-NULL when getting CPUID entries Add a sanity check in __vcpu_get_cpuid_entry() to provide a friendlier error than a segfault when a test developer tries to use a vCPU CPUID helper on a barebones vCPU. Link: https://lore.kernel.org/r/20241128013424.4096668-8-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 9ec984cf8674..71aa290c583f 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1014,6 +1014,8 @@ static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *v uint32_t function, uint32_t index) { + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first (or equivalent)"); + return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, function, index); } -- cgit v1.2.3 From a2a791e8208623b1575f21c7ec559df095c0a96e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:35 -0800 Subject: KVM: selftests: Refresh vCPU CPUID cache in __vcpu_get_cpuid_entry() Refresh selftests' CPUID cache in the vCPU structure when querying a CPUID entry so that tests don't consume stale data when KVM modifies CPUID as a side effect to a completely unrelated change. E.g. KVM adjusts OSXSAVE in response to CR4.OSXSAVE changes. Unnecessarily invoking KVM_GET_CPUID is suboptimal, but vcpu->cpuid exists to simplify selftests development, not for performance reasons. And, unfortunately, trying to handle the side effects in tests or other flows is unpleasant, e.g. selftests could manually refresh if KVM_SET_SREGS is successful, but that would still leave a gap with respect to guest CR4 changes. 
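As a hedged illustration (not taken from the patch), this is the kind of flow the refresh keeps honest, assuming a vCPU created with the default, fully populated guest CPUID on a host with XSAVE support:

    static void check_osxsave_tracks_cr4(struct kvm_vcpu *vcpu)
    {
            struct kvm_sregs sregs;

            /* Toggle CR4.OSXSAVE via KVM_SET_SREGS; KVM mirrors it into CPUID. */
            vcpu_sregs_get(vcpu, &sregs);
            sregs.cr4 |= X86_CR4_OSXSAVE;
            vcpu_sregs_set(vcpu, &sregs);

            /*
             * With the refresh in __vcpu_get_cpuid_entry(), this reads KVM's
             * current view instead of the snapshot cached at vCPU creation.
             */
            TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_OSXSAVE),
                        "OSXSAVE should be reflected in guest CPUID");
    }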
Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20241128013424.4096668-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 71aa290c583f..b6753e27dfb6 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1010,12 +1010,19 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); +static inline void vcpu_get_cpuid(struct kvm_vcpu *vcpu) +{ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); +} + static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) { TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first (or equivalent)"); + vcpu_get_cpuid(vcpu); + return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, function, index); } @@ -1036,7 +1043,7 @@ static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) return r; /* On success, refresh the cache to pick up adjustments made by KVM. */ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); + vcpu_get_cpuid(vcpu); return 0; } @@ -1046,12 +1053,7 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); /* Refresh the cache to pick up adjustments made by KVM. */ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); -} - -static inline void vcpu_get_cpuid(struct kvm_vcpu *vcpu) -{ - vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); + vcpu_get_cpuid(vcpu); } void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 01bcd829c63fdde92d9d6c32b2ed3ba34ead0930 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:36 -0800 Subject: KVM: selftests: Verify KVM stuffs runtime CPUID OS bits on CR4 writes Extend x86's set sregs test to verify that KVM sets/clears OSXSAVE and OSKPKE according to CR4.XSAVE and CR4.PKE respectively. For performance reasons, KVM is responsible for emulating the architectural behavior of the OS CPUID bits tracking CR4. Reviewed-by: Maxim Levitsky Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20241128013424.4096668-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/set_sregs_test.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/set_sregs_test.c b/tools/testing/selftests/kvm/x86/set_sregs_test.c index 96fd690d479a..f4095a3d1278 100644 --- a/tools/testing/selftests/kvm/x86/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86/set_sregs_test.c @@ -85,6 +85,16 @@ static void test_cr_bits(struct kvm_vcpu *vcpu, uint64_t cr4) rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); + TEST_ASSERT(!!(sregs.cr4 & X86_CR4_OSXSAVE) == + (vcpu->cpuid && vcpu_cpuid_has(vcpu, X86_FEATURE_OSXSAVE)), + "KVM didn't %s OSXSAVE in CPUID as expected", + (sregs.cr4 & X86_CR4_OSXSAVE) ? "set" : "clear"); + + TEST_ASSERT(!!(sregs.cr4 & X86_CR4_PKE) == + (vcpu->cpuid && vcpu_cpuid_has(vcpu, X86_FEATURE_OSPKE)), + "KVM didn't %s OSPKE in CPUID as expected", + (sregs.cr4 & X86_CR4_PKE) ? 
"set" : "clear"); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", sregs.cr4, cr4); -- cgit v1.2.3 From 7b2658cb33c744ca41358ada2421a86774914764 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:43 -0800 Subject: KVM: selftests: Fix a bad TEST_REQUIRE() in x86's KVM PV test Actually check for KVM support for disabling HLT-exiting instead of effectively checking that KVM_CAP_X86_DISABLE_EXITS is #defined to a non-zero value, and convert the TEST_REQUIRE() to a simple return so that only the sub-test is skipped if HLT-exiting is mandatory. The goof has likely gone unnoticed because all x86 CPUs support disabling HLT-exiting, only systems with the opt-in mitigate_smt_rsb KVM module param disallow HLT-exiting. Reviewed-by: Maxim Levitsky Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20241128013424.4096668-17-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/kvm_pv_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/kvm_pv_test.c b/tools/testing/selftests/kvm/x86/kvm_pv_test.c index 78878b3a2725..2aee93108a54 100644 --- a/tools/testing/selftests/kvm/x86/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86/kvm_pv_test.c @@ -140,9 +140,10 @@ static void test_pv_unhalt(void) struct kvm_cpuid_entry2 *ent; u32 kvm_sig_old; - pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + if (!(kvm_check_cap(KVM_CAP_X86_DISABLE_EXITS) & KVM_X86_DISABLE_EXITS_HLT)) + return; - TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); /* KVM_PV_UNHALT test */ vm = vm_create_with_one_vcpu(&vcpu, guest_main); -- cgit v1.2.3 From 59cb3acdb316130c7247a3d3a20d7d6e75e2896a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:44 -0800 Subject: KVM: selftests: Update x86's KVM PV test to match KVM's disabling exits behavior Rework x86's KVM PV features test to align with KVM's new, fixed behavior of not allowing userspace to disable HLT-exiting after vCPUs have been created. Rework the core testcase to disable HLT-exiting before creating a vCPU, and opportunistically modify keep the paired VM+vCPU creation to verify that KVM rejects KVM_CAP_X86_DISABLE_EXITS as expected. Reviewed-by: Maxim Levitsky Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20241128013424.4096668-18-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/kvm_pv_test.c | 33 ++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86/kvm_pv_test.c b/tools/testing/selftests/kvm/x86/kvm_pv_test.c index 2aee93108a54..1b805cbdb47b 100644 --- a/tools/testing/selftests/kvm/x86/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86/kvm_pv_test.c @@ -139,6 +139,7 @@ static void test_pv_unhalt(void) struct kvm_vm *vm; struct kvm_cpuid_entry2 *ent; u32 kvm_sig_old; + int r; if (!(kvm_check_cap(KVM_CAP_X86_DISABLE_EXITS) & KVM_X86_DISABLE_EXITS_HLT)) return; @@ -152,19 +153,45 @@ static void test_pv_unhalt(void) TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); - /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + /* Verify KVM disallows disabling exits after vCPU creation. 
*/ + r = __vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + TEST_ASSERT(r && errno == EINVAL, + "Disabling exits after vCPU creation didn't fail as expected"); + + kvm_vm_free(vm); + + /* Verify that KVM clears PV_UNHALT from guest CPUID. */ + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + + vcpu = vm_vcpu_add(vm, 0, NULL); + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "vCPU created with PV_UNHALT set by default"); + + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "PV_UNHALT set in guest CPUID when HLT-exiting is disabled"); + + /* + * Clobber the KVM PV signature and verify KVM does NOT clear PV_UNHALT + * when KVM PV is not present, and DOES clear PV_UNHALT when switching + * back to the correct signature. + */ ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); kvm_sig_old = ent->ebx; ent->ebx = 0xdeadbeef; vcpu_set_cpuid(vcpu); - vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "PV_UNHALT cleared when using bogus KVM PV signature"); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); ent->ebx = kvm_sig_old; vcpu_set_cpuid(vcpu); TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), - "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + "PV_UNHALT set in guest CPUID when HLT-exiting is disabled"); /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ -- cgit v1.2.3
From 7a9b65ab0abd52ae646ba327522315d7500a7d4f Mon Sep 17 00:00:00 2001 From: Amit Vadhavana Date: Sat, 16 Nov 2024 20:51:36 +0530 Subject: selftests: refactor the lsm `flags_overset_lsm_set_self_attr` test
Remove the temporary context variable `tctx` to simplify the code. Use the original context `ctx` directly in calls to `lsm_get_self_attr`, eliminating redundancy without any functional changes.
Reviewed-by: Casey Schaufler Reviewed-by: Shuah Khan Signed-off-by: Amit Vadhavana [PM: subject tweak] Signed-off-by: Paul Moore --- tools/testing/selftests/lsm/lsm_set_self_attr_test.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools')
diff --git a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c index 66dec47e3ca3..732e89fe99c0 100644 --- a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c +++ b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c @@ -56,16 +56,15 @@ TEST(flags_zero_lsm_set_self_attr) TEST(flags_overset_lsm_set_self_attr) { const long page_size = sysconf(_SC_PAGESIZE); - char *ctx = calloc(page_size, 1); + struct lsm_ctx *ctx = calloc(page_size, 1); __u32 size = page_size; - struct lsm_ctx *tctx = (struct lsm_ctx *)ctx; ASSERT_NE(NULL, ctx); if (attr_lsm_count()) { - ASSERT_LE(1, lsm_get_self_attr(LSM_ATTR_CURRENT, tctx, &size, + ASSERT_LE(1, lsm_get_self_attr(LSM_ATTR_CURRENT, ctx, &size, 0)); } - ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT | LSM_ATTR_PREV, tctx, + ASSERT_EQ(-1, lsm_set_self_attr(LSM_ATTR_CURRENT | LSM_ATTR_PREV, ctx, size, 0)); free(ctx); -- cgit v1.2.3
From 4e9427aeb9572a4023b42e64ca2cd2ca3cbf7e20 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 17 Dec 2024 18:14:57 +0000 Subject: KVM: selftests: Add and use a helper function for x86's LIDT
Implement a function for setting the IDT descriptor from the guest code.
Replace the existing lidt occurrences with calls to this function as `lidt` is used in multiple places. Signed-off-by: Ivan Orlov Link: https://lore.kernel.org/r/20241217181458.68690-7-iorlov@amazon.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 5 +++++ tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- tools/testing/selftests/kvm/x86/sev_smoke_test.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index b6753e27dfb6..daa02b851273 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -569,6 +569,11 @@ static inline void set_cr4(uint64_t val) __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); } +static inline void set_idt(const struct desc_ptr *idt_desc) +{ + __asm__ __volatile__("lidt %0"::"m"(*idt_desc)); +} + static inline u64 xgetbv(u32 index) { u32 eax, edx; diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 86ee3385e860..621a227e2bf2 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -235,7 +235,7 @@ static void guest_code_delete_memory_region(void) * in the guest will never succeed, and so isn't an option. */ memset(&idt, 0, sizeof(idt)); - __asm__ __volatile__("lidt %0" :: "m"(idt)); + set_idt(&idt); GUEST_SYNC(0); diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index ae77698e6e97..a1a688e75266 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -155,7 +155,7 @@ static void guest_shutdown_code(void) /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. */ memset(&idt, 0, sizeof(idt)); - __asm__ __volatile__("lidt %0" :: "m"(idt)); + set_idt(&idt); __asm__ __volatile__("ud2"); } -- cgit v1.2.3 From 62e41f6b4f3697e5909cdf70d56e9a7ebd958732 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 17 Dec 2024 18:14:58 +0000 Subject: KVM: selftests: Add test case for MMIO during vectoring on x86 Extend the 'set_memory_region_test' with an x86-only test case which covers emulated MMIO during event vectoring error handling. The test case 1) Sets an IDT descriptor base to point to an MMIO address 2) Generates a #GP in the guest 3) Verifies userspace gets the correct exit reason, suberror code, and GPA in internal.data[3] Opportunistically add a definition for a non-canonical address to processor.h so that the source of the #GP is somewhat self-documenting, and so that future tests don't have to reinvent the wheel. 
Signed-off-by: Ivan Orlov Link: https://lore.kernel.org/r/20241217181458.68690-8-iorlov@amazon.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 2 + .../testing/selftests/kvm/set_memory_region_test.c | 51 ++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index daa02b851273..d60da8966772 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -27,6 +27,8 @@ extern uint64_t guest_tsc_khz; #define MAX_NR_CPUID_ENTRIES 100 #endif +#define NONCANONICAL 0xaaaaaaaaaaaaaaaaull + /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 621a227e2bf2..bc440d5aba57 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -553,6 +553,56 @@ static void test_add_overlapping_private_memory_regions(void) close(memfd); kvm_vm_free(vm); } + +static void guest_code_mmio_during_vectoring(void) +{ + const struct desc_ptr idt_desc = { + .address = MEM_REGION_GPA, + .size = 0xFFF, + }; + + set_idt(&idt_desc); + + /* Generate a #GP by dereferencing a non-canonical address */ + *((uint8_t *)NONCANONICAL) = 0x1; + + GUEST_ASSERT(0); +} + +/* + * This test points the IDT descriptor base to an MMIO address. It should cause + * a KVM internal error when an event occurs in the guest. + */ +static void test_mmio_during_vectoring(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + u64 expected_gpa; + + pr_info("Testing MMIO during vectoring error handling\n"); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_mmio_during_vectoring); + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 1); + + run = vcpu->run; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT(run->internal.suberror == KVM_INTERNAL_ERROR_DELIVERY_EV, + "Unexpected suberror = %d", vcpu->run->internal.suberror); + TEST_ASSERT(run->internal.ndata != 4, "Unexpected internal error data array size = %d", + run->internal.ndata); + + /* The reported GPA should be IDT base + offset of the GP vector */ + expected_gpa = MEM_REGION_GPA + GP_VECTOR * sizeof(struct idt_entry); + + TEST_ASSERT(run->internal.data[3] == expected_gpa, + "Unexpected GPA = %llx (expected %lx)", + vcpu->run->internal.data[3], expected_gpa); + + kvm_vm_free(vm); +} #endif int main(int argc, char *argv[]) @@ -568,6 +618,7 @@ int main(int argc, char *argv[]) * KVM_RUN fails with ENOEXEC or EFAULT. */ test_zero_memory_regions(); + test_mmio_during_vectoring(); #endif test_invalid_memory_region_flags(); -- cgit v1.2.3 From b083cc815376a8ccfba6535b4d59a396b77601d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:18 +0100 Subject: selftests/exec: Add 32 tests for AT_EXECVE_CHECK and exec securebits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test that checks performed by execveat(..., AT_EXECVE_CHECK) are consistent with noexec mount points and file execute permissions. 
Test that SECBIT_EXEC_RESTRICT_FILE and SECBIT_EXEC_DENY_INTERACTIVE are inherited by child processes and that they can be pinned with the appropriate SECBIT_EXEC_RESTRICT_FILE_LOCKED and SECBIT_EXEC_DENY_INTERACTIVE_LOCKED bits. Cc: Al Viro Cc: Christian Brauner Cc: Kees Cook Cc: Paul Moore Cc: Serge Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-4-mic@digikod.net Signed-off-by: Kees Cook --- tools/testing/selftests/exec/.gitignore | 2 + tools/testing/selftests/exec/Makefile | 7 + tools/testing/selftests/exec/check-exec.c | 456 ++++++++++++++++++++++++++++++ tools/testing/selftests/exec/config | 2 + tools/testing/selftests/exec/false.c | 5 + 5 files changed, 472 insertions(+) create mode 100644 tools/testing/selftests/exec/check-exec.c create mode 100644 tools/testing/selftests/exec/config create mode 100644 tools/testing/selftests/exec/false.c (limited to 'tools') diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index a0dc5d4bf733..a32c63bb4df1 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -9,6 +9,8 @@ execveat.ephemeral execveat.denatured non-regular null-argv +/check-exec +/false /load_address.* !load_address.c /recursion-depth diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index ba012bc5aab9..8713d1c862ae 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -Wall CFLAGS += -Wno-nonnull +CFLAGS += $(KHDR_INCLUDES) + +LDLIBS += -lcap ALIGNS := 0x1000 0x200000 0x1000000 ALIGN_PIES := $(patsubst %,load_address.%,$(ALIGNS)) @@ -9,12 +12,14 @@ ALIGNMENT_TESTS := $(ALIGN_PIES) $(ALIGN_STATIC_PIES) TEST_PROGS := binfmt_script.py TEST_GEN_PROGS := execveat non-regular $(ALIGNMENT_TESTS) +TEST_GEN_PROGS_EXTENDED := false TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile TEST_GEN_PROGS += recursion-depth TEST_GEN_PROGS += null-argv +TEST_GEN_PROGS += check-exec EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \ $(OUTPUT)/S_I*.test @@ -38,3 +43,5 @@ $(OUTPUT)/load_address.0x%: load_address.c $(OUTPUT)/load_address.static.0x%: load_address.c $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=$(lastword $(subst ., ,$@)) \ -fPIE -static-pie $< -o $@ +$(OUTPUT)/false: false.c + $(CC) $(CFLAGS) $(LDFLAGS) -static $< -o $@ diff --git a/tools/testing/selftests/exec/check-exec.c b/tools/testing/selftests/exec/check-exec.c new file mode 100644 index 000000000000..4d3f4525e1e1 --- /dev/null +++ b/tools/testing/selftests/exec/check-exec.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test execveat(2) with AT_EXECVE_CHECK, and prctl(2) with + * SECBIT_EXEC_RESTRICT_FILE, SECBIT_EXEC_DENY_INTERACTIVE, and their locked + * counterparts. + * + * Copyright © 2018-2020 ANSSI + * Copyright © 2024 Microsoft Corporation + * + * Author: Mickaël Salaün + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Defines AT_EXECVE_CHECK without type conflicts. 
*/ +#define _ASM_GENERIC_FCNTL_H +#include + +#include "../kselftest_harness.h" + +static void drop_privileges(struct __test_metadata *const _metadata) +{ + const unsigned int noroot = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED; + cap_t cap_p; + + if ((cap_get_secbits() & noroot) != noroot) + EXPECT_EQ(0, cap_set_secbits(noroot)); + + cap_p = cap_get_proc(); + EXPECT_NE(NULL, cap_p); + EXPECT_NE(-1, cap_clear(cap_p)); + + /* + * Drops everything, especially CAP_SETPCAP, CAP_DAC_OVERRIDE, and + * CAP_DAC_READ_SEARCH. + */ + EXPECT_NE(-1, cap_set_proc(cap_p)); + EXPECT_NE(-1, cap_free(cap_p)); +} + +static int test_secbits_set(const unsigned int secbits) +{ + int err; + + err = prctl(PR_SET_SECUREBITS, secbits); + if (err) + return errno; + return 0; +} + +FIXTURE(access) +{ + int memfd, pipefd; + int pipe_fds[2], socket_fds[2]; +}; + +FIXTURE_VARIANT(access) +{ + const bool mount_exec; + const bool file_exec; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(access, mount_exec_file_exec) { + /* clang-format on */ + .mount_exec = true, + .file_exec = true, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(access, mount_exec_file_noexec) { + /* clang-format on */ + .mount_exec = true, + .file_exec = false, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(access, mount_noexec_file_exec) { + /* clang-format on */ + .mount_exec = false, + .file_exec = true, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(access, mount_noexec_file_noexec) { + /* clang-format on */ + .mount_exec = false, + .file_exec = false, +}; + +static const char binary_path[] = "./false"; +static const char workdir_path[] = "./test-mount"; +static const char reg_file_path[] = "./test-mount/regular_file"; +static const char dir_path[] = "./test-mount/directory"; +static const char block_dev_path[] = "./test-mount/block_device"; +static const char char_dev_path[] = "./test-mount/character_device"; +static const char fifo_path[] = "./test-mount/fifo"; + +FIXTURE_SETUP(access) +{ + int procfd_path_size; + static const char path_template[] = "/proc/self/fd/%d"; + char procfd_path[sizeof(path_template) + 10]; + + /* Makes sure we are not already restricted nor locked. */ + EXPECT_EQ(0, test_secbits_set(0)); + + /* + * Cleans previous workspace if any error previously happened (don't + * check errors). + */ + umount(workdir_path); + rmdir(workdir_path); + + /* Creates a clean mount point. */ + ASSERT_EQ(0, mkdir(workdir_path, 00700)); + ASSERT_EQ(0, mount("test", workdir_path, "tmpfs", + MS_MGC_VAL | (variant->mount_exec ? 0 : MS_NOEXEC), + "mode=0700,size=9m")); + + /* Creates a regular file. */ + ASSERT_EQ(0, mknod(reg_file_path, + S_IFREG | (variant->file_exec ? 0700 : 0600), 0)); + /* Creates a directory. */ + ASSERT_EQ(0, mkdir(dir_path, variant->file_exec ? 0700 : 0600)); + /* Creates a character device: /dev/null. */ + ASSERT_EQ(0, mknod(char_dev_path, S_IFCHR | 0400, makedev(1, 3))); + /* Creates a block device: /dev/loop0 */ + ASSERT_EQ(0, mknod(block_dev_path, S_IFBLK | 0400, makedev(7, 0))); + /* Creates a fifo. */ + ASSERT_EQ(0, mknod(fifo_path, S_IFIFO | 0600, 0)); + + /* Creates a regular file without user mount point. */ + self->memfd = memfd_create("test-exec-probe", MFD_CLOEXEC); + ASSERT_LE(0, self->memfd); + /* Sets mode, which must be ignored by the exec check. */ + ASSERT_EQ(0, fchmod(self->memfd, variant->file_exec ? 0700 : 0600)); + + /* Creates a pipefs file descriptor. 
*/ + ASSERT_EQ(0, pipe(self->pipe_fds)); + procfd_path_size = snprintf(procfd_path, sizeof(procfd_path), + path_template, self->pipe_fds[0]); + ASSERT_LT(procfd_path_size, sizeof(procfd_path)); + self->pipefd = open(procfd_path, O_RDWR | O_CLOEXEC); + ASSERT_LE(0, self->pipefd); + ASSERT_EQ(0, fchmod(self->pipefd, variant->file_exec ? 0700 : 0600)); + + /* Creates a socket file descriptor. */ + ASSERT_EQ(0, socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, + self->socket_fds)); +} + +FIXTURE_TEARDOWN_PARENT(access) +{ + /* There is no need to unlink the test files. */ + EXPECT_EQ(0, umount(workdir_path)); + EXPECT_EQ(0, rmdir(workdir_path)); +} + +static void fill_exec_fd(struct __test_metadata *_metadata, const int fd_out) +{ + char buf[1024]; + size_t len; + int fd_in; + + fd_in = open(binary_path, O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd_in); + /* Cannot use copy_file_range(2) because of EXDEV. */ + len = read(fd_in, buf, sizeof(buf)); + EXPECT_LE(0, len); + while (len > 0) { + EXPECT_EQ(len, write(fd_out, buf, len)) + { + TH_LOG("Failed to write: %s (%d)", strerror(errno), + errno); + } + len = read(fd_in, buf, sizeof(buf)); + EXPECT_LE(0, len); + } + EXPECT_EQ(0, close(fd_in)); +} + +static void fill_exec_path(struct __test_metadata *_metadata, + const char *const path) +{ + int fd_out; + + fd_out = open(path, O_CLOEXEC | O_WRONLY); + ASSERT_LE(0, fd_out) + { + TH_LOG("Failed to open %s: %s", path, strerror(errno)); + } + fill_exec_fd(_metadata, fd_out); + EXPECT_EQ(0, close(fd_out)); +} + +static void test_exec_fd(struct __test_metadata *_metadata, const int fd, + const int err_code) +{ + char *const argv[] = { "", NULL }; + int access_ret, access_errno; + + /* + * If we really execute fd, filled with the "false" binary, the current + * thread will exits with an error, which will be interpreted by the + * test framework as an error. With AT_EXECVE_CHECK, we only check a + * potential successful execution. + */ + access_ret = + execveat(fd, "", argv, NULL, AT_EMPTY_PATH | AT_EXECVE_CHECK); + access_errno = errno; + if (err_code) { + EXPECT_EQ(-1, access_ret); + EXPECT_EQ(err_code, access_errno) + { + TH_LOG("Wrong error for execveat(2): %s (%d)", + strerror(access_errno), errno); + } + } else { + EXPECT_EQ(0, access_ret) + { + TH_LOG("Access denied: %s", strerror(access_errno)); + } + } +} + +static void test_exec_path(struct __test_metadata *_metadata, + const char *const path, const int err_code) +{ + int flags = O_CLOEXEC; + int fd; + + /* Do not block on pipes. */ + if (path == fifo_path) + flags |= O_NONBLOCK; + + fd = open(path, flags | O_RDONLY); + ASSERT_LE(0, fd) + { + TH_LOG("Failed to open %s: %s", path, strerror(errno)); + } + test_exec_fd(_metadata, fd, err_code); + EXPECT_EQ(0, close(fd)); +} + +/* Tests that we don't get ENOEXEC. */ +TEST_F(access, regular_file_empty) +{ + const int exec = variant->mount_exec && variant->file_exec; + + test_exec_path(_metadata, reg_file_path, exec ? 0 : EACCES); + + drop_privileges(_metadata); + test_exec_path(_metadata, reg_file_path, exec ? 0 : EACCES); +} + +TEST_F(access, regular_file_elf) +{ + const int exec = variant->mount_exec && variant->file_exec; + + fill_exec_path(_metadata, reg_file_path); + + test_exec_path(_metadata, reg_file_path, exec ? 0 : EACCES); + + drop_privileges(_metadata); + test_exec_path(_metadata, reg_file_path, exec ? 0 : EACCES); +} + +/* Tests that we don't get ENOEXEC. */ +TEST_F(access, memfd_empty) +{ + const int exec = variant->file_exec; + + test_exec_fd(_metadata, self->memfd, exec ? 
0 : EACCES); + + drop_privileges(_metadata); + test_exec_fd(_metadata, self->memfd, exec ? 0 : EACCES); +} + +TEST_F(access, memfd_elf) +{ + const int exec = variant->file_exec; + + fill_exec_fd(_metadata, self->memfd); + + test_exec_fd(_metadata, self->memfd, exec ? 0 : EACCES); + + drop_privileges(_metadata); + test_exec_fd(_metadata, self->memfd, exec ? 0 : EACCES); +} + +TEST_F(access, non_regular_files) +{ + test_exec_path(_metadata, dir_path, EACCES); + test_exec_path(_metadata, block_dev_path, EACCES); + test_exec_path(_metadata, char_dev_path, EACCES); + test_exec_path(_metadata, fifo_path, EACCES); + test_exec_fd(_metadata, self->socket_fds[0], EACCES); + test_exec_fd(_metadata, self->pipefd, EACCES); +} + +/* clang-format off */ +FIXTURE(secbits) {}; +/* clang-format on */ + +FIXTURE_VARIANT(secbits) +{ + const bool is_privileged; + const int error; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(secbits, priv) { + /* clang-format on */ + .is_privileged = true, + .error = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(secbits, unpriv) { + /* clang-format on */ + .is_privileged = false, + .error = EPERM, +}; + +FIXTURE_SETUP(secbits) +{ + /* Makes sure no exec bits are set. */ + EXPECT_EQ(0, test_secbits_set(0)); + EXPECT_EQ(0, prctl(PR_GET_SECUREBITS)); + + if (!variant->is_privileged) + drop_privileges(_metadata); +} + +FIXTURE_TEARDOWN(secbits) +{ +} + +TEST_F(secbits, legacy) +{ + EXPECT_EQ(variant->error, test_secbits_set(0)); +} + +#define CHILD(...) \ + do { \ + pid_t child = vfork(); \ + EXPECT_LE(0, child); \ + if (child == 0) { \ + __VA_ARGS__; \ + _exit(0); \ + } \ + } while (0) + +TEST_F(secbits, exec) +{ + unsigned int secbits = prctl(PR_GET_SECUREBITS); + + secbits |= SECBIT_EXEC_RESTRICT_FILE; + EXPECT_EQ(0, test_secbits_set(secbits)); + EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS)); + CHILD(EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS))); + + secbits |= SECBIT_EXEC_DENY_INTERACTIVE; + EXPECT_EQ(0, test_secbits_set(secbits)); + EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS)); + CHILD(EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS))); + + secbits &= ~(SECBIT_EXEC_RESTRICT_FILE | SECBIT_EXEC_DENY_INTERACTIVE); + EXPECT_EQ(0, test_secbits_set(secbits)); + EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS)); + CHILD(EXPECT_EQ(secbits, prctl(PR_GET_SECUREBITS))); +} + +TEST_F(secbits, check_locked_set) +{ + unsigned int secbits = prctl(PR_GET_SECUREBITS); + + secbits |= SECBIT_EXEC_RESTRICT_FILE; + EXPECT_EQ(0, test_secbits_set(secbits)); + secbits |= SECBIT_EXEC_RESTRICT_FILE_LOCKED; + EXPECT_EQ(0, test_secbits_set(secbits)); + + /* Checks lock set but unchanged. */ + EXPECT_EQ(variant->error, test_secbits_set(secbits)); + CHILD(EXPECT_EQ(variant->error, test_secbits_set(secbits))); + + secbits &= ~SECBIT_EXEC_RESTRICT_FILE; + EXPECT_EQ(EPERM, test_secbits_set(0)); + CHILD(EXPECT_EQ(EPERM, test_secbits_set(0))); +} + +TEST_F(secbits, check_locked_unset) +{ + unsigned int secbits = prctl(PR_GET_SECUREBITS); + + secbits |= SECBIT_EXEC_RESTRICT_FILE_LOCKED; + EXPECT_EQ(0, test_secbits_set(secbits)); + + /* Checks lock unset but unchanged. 
*/ + EXPECT_EQ(variant->error, test_secbits_set(secbits)); + CHILD(EXPECT_EQ(variant->error, test_secbits_set(secbits))); + + secbits &= ~SECBIT_EXEC_RESTRICT_FILE; + EXPECT_EQ(EPERM, test_secbits_set(0)); + CHILD(EXPECT_EQ(EPERM, test_secbits_set(0))); +} + +TEST_F(secbits, restrict_locked_set) +{ + unsigned int secbits = prctl(PR_GET_SECUREBITS); + + secbits |= SECBIT_EXEC_DENY_INTERACTIVE; + EXPECT_EQ(0, test_secbits_set(secbits)); + secbits |= SECBIT_EXEC_DENY_INTERACTIVE_LOCKED; + EXPECT_EQ(0, test_secbits_set(secbits)); + + /* Checks lock set but unchanged. */ + EXPECT_EQ(variant->error, test_secbits_set(secbits)); + CHILD(EXPECT_EQ(variant->error, test_secbits_set(secbits))); + + secbits &= ~SECBIT_EXEC_DENY_INTERACTIVE; + EXPECT_EQ(EPERM, test_secbits_set(0)); + CHILD(EXPECT_EQ(EPERM, test_secbits_set(0))); +} + +TEST_F(secbits, restrict_locked_unset) +{ + unsigned int secbits = prctl(PR_GET_SECUREBITS); + + secbits |= SECBIT_EXEC_DENY_INTERACTIVE_LOCKED; + EXPECT_EQ(0, test_secbits_set(secbits)); + + /* Checks lock unset but unchanged. */ + EXPECT_EQ(variant->error, test_secbits_set(secbits)); + CHILD(EXPECT_EQ(variant->error, test_secbits_set(secbits))); + + secbits &= ~SECBIT_EXEC_DENY_INTERACTIVE; + EXPECT_EQ(EPERM, test_secbits_set(0)); + CHILD(EXPECT_EQ(EPERM, test_secbits_set(0))); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/exec/config b/tools/testing/selftests/exec/config new file mode 100644 index 000000000000..c308079867b3 --- /dev/null +++ b/tools/testing/selftests/exec/config @@ -0,0 +1,2 @@ +CONFIG_BLK_DEV=y +CONFIG_BLK_DEV_LOOP=y diff --git a/tools/testing/selftests/exec/false.c b/tools/testing/selftests/exec/false.c new file mode 100644 index 000000000000..104383ec3a79 --- /dev/null +++ b/tools/testing/selftests/exec/false.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +int main(void) +{ + return 1; +} -- cgit v1.2.3 From 0e7f90f34cf79bf329d6d08edea3403544498843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:19 +0100 Subject: selftests/landlock: Add tests for execveat + AT_EXECVE_CHECK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend layout1.execute with the new AT_EXECVE_CHECK flag. The semantic with AT_EXECVE_CHECK is the same as with a simple execve(2), LANDLOCK_ACCESS_FS_EXECUTE is enforced the same way. Cc: Günther Noack Cc: Kees Cook Cc: Paul Moore Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-5-mic@digikod.net Signed-off-by: Kees Cook --- tools/testing/selftests/landlock/fs_test.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 6788762188fe..cd66901be612 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -37,6 +37,10 @@ #include #include +/* Defines AT_EXECVE_CHECK without type conflicts. 
*/ +#define _ASM_GENERIC_FCNTL_H +#include + #include "common.h" #ifndef renameat2 @@ -2008,6 +2012,22 @@ static void test_execute(struct __test_metadata *const _metadata, const int err, }; } +static void test_check_exec(struct __test_metadata *const _metadata, + const int err, const char *const path) +{ + int ret; + char *const argv[] = { (char *)path, NULL }; + + ret = execveat(AT_FDCWD, path, argv, NULL, + AT_EMPTY_PATH | AT_EXECVE_CHECK); + if (err) { + EXPECT_EQ(-1, ret); + EXPECT_EQ(errno, err); + } else { + EXPECT_EQ(0, ret); + } +} + TEST_F_FORK(layout1, execute) { const struct rule rules[] = { @@ -2025,20 +2045,27 @@ TEST_F_FORK(layout1, execute) copy_binary(_metadata, file1_s1d2); copy_binary(_metadata, file1_s1d3); + /* Checks before file1_s1d1 being denied. */ + test_execute(_metadata, 0, file1_s1d1); + test_check_exec(_metadata, 0, file1_s1d1); + enforce_ruleset(_metadata, ruleset_fd); ASSERT_EQ(0, close(ruleset_fd)); ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); test_execute(_metadata, EACCES, file1_s1d1); + test_check_exec(_metadata, EACCES, file1_s1d1); ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY)); ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); test_execute(_metadata, 0, file1_s1d2); + test_check_exec(_metadata, 0, file1_s1d2); ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY)); ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); test_execute(_metadata, 0, file1_s1d3); + test_check_exec(_metadata, 0, file1_s1d3); } TEST_F_FORK(layout1, link) -- cgit v1.2.3 From 3e707b07f582c12ed78fa5516a97bf701bf0634c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:21 +0100 Subject: selftests: ktap_helpers: Fix uninitialized variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __ktap_test() may be called without the optional third argument which is an issue for scripts using `set -u` to detect uninitialized variables and potential bugs. Fix this optional "directive" argument by either using the third argument or an empty string. This is required for the next commit to properly test script execution control. Cc: Kees Cook Cc: Nícolas F. R. A. Prado Cc: Shuah Khan Fixes: 14571ab1ad21 ("kselftest: Add new test for detecting unprobed Devicetree devices") Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-7-mic@digikod.net Signed-off-by: Kees Cook --- tools/testing/selftests/kselftest/ktap_helpers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index 79a125eb24c2..14e7f3ec3f84 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -40,7 +40,7 @@ ktap_skip_all() { __ktap_test() { result="$1" description="$2" - directive="$3" # optional + directive="${3:-}" # optional local directive_str= [ ! -z "$directive" ] && directive_str="# $directive" -- cgit v1.2.3 From 2a69962be4a7e97ab347e05826480a3352c6fbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:22 +0100 Subject: samples/check-exec: Add an enlighten "inc" interpreter and 28 tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a very simple script interpreter called "inc" that can evaluate two different commands (one per line): - "?" 
to initialize a counter from user's input; - "+" to increment the counter (which is set to 0 by default). It is enlighten to only interpret executable files according to AT_EXECVE_CHECK and the related securebits: # Executing a script with RESTRICT_FILE is only allowed if the script # is executable: ./set-exec -f -- ./inc script-exec.inc # Allowed ./set-exec -f -- ./inc script-noexec.inc # Denied # Executing stdin with DENY_INTERACTIVE is only allowed if stdin is an # executable regular file: ./set-exec -i -- ./inc -i < script-exec.inc # Allowed ./set-exec -i -- ./inc -i < script-noexec.inc # Denied # However, a pipe is not executable and it is then denied: cat script-noexec.inc | ./set-exec -i -- ./inc -i # Denied # Executing raw data (e.g. command argument) with DENY_INTERACTIVE is # always denied. ./set-exec -i -- ./inc -c "+" # Denied ./inc -c "$( Cc: Christian Brauner Cc: Kees Cook Cc: Paul Moore Cc: Serge Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-8-mic@digikod.net Signed-off-by: Kees Cook --- tools/testing/selftests/exec/.gitignore | 2 + tools/testing/selftests/exec/Makefile | 14 +- tools/testing/selftests/exec/check-exec-tests.sh | 205 +++++++++++++++++++++++ 3 files changed, 219 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/exec/check-exec-tests.sh (limited to 'tools') diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index a32c63bb4df1..7f3d1ae762ec 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -11,9 +11,11 @@ non-regular null-argv /check-exec /false +/inc /load_address.* !load_address.c /recursion-depth +/set-exec xxxxxxxx* pipe S_I*.test diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 8713d1c862ae..45a3cfc435cf 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -10,9 +10,9 @@ ALIGN_PIES := $(patsubst %,load_address.%,$(ALIGNS)) ALIGN_STATIC_PIES := $(patsubst %,load_address.static.%,$(ALIGNS)) ALIGNMENT_TESTS := $(ALIGN_PIES) $(ALIGN_STATIC_PIES) -TEST_PROGS := binfmt_script.py +TEST_PROGS := binfmt_script.py check-exec-tests.sh TEST_GEN_PROGS := execveat non-regular $(ALIGNMENT_TESTS) -TEST_GEN_PROGS_EXTENDED := false +TEST_GEN_PROGS_EXTENDED := false inc set-exec script-exec.inc script-noexec.inc TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile @@ -26,6 +26,8 @@ EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* include ../lib.mk +CHECK_EXEC_SAMPLES := $(top_srcdir)/samples/check-exec + $(OUTPUT)/subdir: mkdir -p $@ $(OUTPUT)/script: Makefile @@ -45,3 +47,11 @@ $(OUTPUT)/load_address.static.0x%: load_address.c -fPIE -static-pie $< -o $@ $(OUTPUT)/false: false.c $(CC) $(CFLAGS) $(LDFLAGS) -static $< -o $@ +$(OUTPUT)/inc: $(CHECK_EXEC_SAMPLES)/inc.c + $(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ +$(OUTPUT)/set-exec: $(CHECK_EXEC_SAMPLES)/set-exec.c + $(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ +$(OUTPUT)/script-exec.inc: $(CHECK_EXEC_SAMPLES)/script-exec.inc + cp $< $@ +$(OUTPUT)/script-noexec.inc: $(CHECK_EXEC_SAMPLES)/script-noexec.inc + cp $< $@ diff --git a/tools/testing/selftests/exec/check-exec-tests.sh b/tools/testing/selftests/exec/check-exec-tests.sh new file mode 100755 index 000000000000..87102906ae3c --- /dev/null +++ b/tools/testing/selftests/exec/check-exec-tests.sh 
@@ -0,0 +1,205 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test the "inc" interpreter. +# +# See include/uapi/linux/securebits.h, include/uapi/linux/fcntl.h and +# samples/check-exec/inc.c +# +# Copyright © 2024 Microsoft Corporation + +set -u -e -o pipefail + +EXPECTED_OUTPUT="1" +exec 2>/dev/null + +DIR="$(dirname $(readlink -f "$0"))" +source "${DIR}"/../kselftest/ktap_helpers.sh + +exec_direct() { + local expect="$1" + local script="$2" + shift 2 + local ret=0 + local out + + # Updates PATH for `env` to execute the `inc` interpreter. + out="$(PATH="." "$@" "${script}")" || ret=$? + + if [[ ${ret} -ne ${expect} ]]; then + echo "ERROR: Wrong expectation for direct file execution: ${ret}" + return 1 + fi + if [[ ${ret} -eq 0 && "${out}" != "${EXPECTED_OUTPUT}" ]]; then + echo "ERROR: Wrong output for direct file execution: ${out}" + return 1 + fi +} + +exec_indirect() { + local expect="$1" + local script="$2" + shift 2 + local ret=0 + local out + + # Script passed as argument. + out="$("$@" ./inc "${script}")" || ret=$? + + if [[ ${ret} -ne ${expect} ]]; then + echo "ERROR: Wrong expectation for indirect file execution: ${ret}" + return 1 + fi + if [[ ${ret} -eq 0 && "${out}" != "${EXPECTED_OUTPUT}" ]]; then + echo "ERROR: Wrong output for indirect file execution: ${out}" + return 1 + fi +} + +exec_stdin_reg() { + local expect="$1" + local script="$2" + shift 2 + local ret=0 + local out + + # Executing stdin must be allowed if the related file is executable. + out="$("$@" ./inc -i < "${script}")" || ret=$? + + if [[ ${ret} -ne ${expect} ]]; then + echo "ERROR: Wrong expectation for stdin regular file execution: ${ret}" + return 1 + fi + if [[ ${ret} -eq 0 && "${out}" != "${EXPECTED_OUTPUT}" ]]; then + echo "ERROR: Wrong output for stdin regular file execution: ${out}" + return 1 + fi +} + +exec_stdin_pipe() { + local expect="$1" + shift + local ret=0 + local out + + # A pipe is not executable. + out="$(cat script-exec.inc | "$@" ./inc -i)" || ret=$? + + if [[ ${ret} -ne ${expect} ]]; then + echo "ERROR: Wrong expectation for stdin pipe execution: ${ret}" + return 1 + fi +} + +exec_argument() { + local expect="$1" + local ret=0 + shift + local out + + # Script not coming from a file must not be executed. + out="$("$@" ./inc -c "$(< script-exec.inc)")" || ret=$? + + if [[ ${ret} -ne ${expect} ]]; then + echo "ERROR: Wrong expectation for arbitrary argument execution: ${ret}" + return 1 + fi + if [[ ${ret} -eq 0 && "${out}" != "${EXPECTED_OUTPUT}" ]]; then + echo "ERROR: Wrong output for arbitrary argument execution: ${out}" + return 1 + fi +} + +exec_interactive() { + exec_stdin_pipe "$@" + exec_argument "$@" +} + +ktap_test() { + ktap_test_result "$*" "$@" +} + +ktap_print_header +ktap_set_plan 28 + +# Without secbit configuration, nothing is changed. + +ktap_print_msg "By default, executable scripts are allowed to be interpreted and executed." +ktap_test exec_direct 0 script-exec.inc +ktap_test exec_indirect 0 script-exec.inc + +ktap_print_msg "By default, executable stdin is allowed to be interpreted." +ktap_test exec_stdin_reg 0 script-exec.inc + +ktap_print_msg "By default, non-executable scripts are allowed to be interpreted, but not directly executed." +# We get 126 because of direct execution by Bash. +ktap_test exec_direct 126 script-noexec.inc +ktap_test exec_indirect 0 script-noexec.inc + +ktap_print_msg "By default, non-executable stdin is allowed to be interpreted." 
+ktap_test exec_stdin_reg 0 script-noexec.inc + +ktap_print_msg "By default, interactive commands are allowed to be interpreted." +ktap_test exec_interactive 0 + +# With only file restriction: protect non-malicious users from inadvertent errors (e.g. python ~/Downloads/*.py). + +ktap_print_msg "With -f, executable scripts are allowed to be interpreted and executed." +ktap_test exec_direct 0 script-exec.inc ./set-exec -f -- +ktap_test exec_indirect 0 script-exec.inc ./set-exec -f -- + +ktap_print_msg "With -f, executable stdin is allowed to be interpreted." +ktap_test exec_stdin_reg 0 script-exec.inc ./set-exec -f -- + +ktap_print_msg "With -f, non-executable scripts are not allowed to be executed nor interpreted." +# Direct execution of non-executable script is alwayse denied by the kernel. +ktap_test exec_direct 1 script-noexec.inc ./set-exec -f -- +ktap_test exec_indirect 1 script-noexec.inc ./set-exec -f -- + +ktap_print_msg "With -f, non-executable stdin is allowed to be interpreted." +ktap_test exec_stdin_reg 0 script-noexec.inc ./set-exec -f -- + +ktap_print_msg "With -f, interactive commands are allowed to be interpreted." +ktap_test exec_interactive 0 ./set-exec -f -- + +# With only denied interactive commands: check or monitor script content (e.g. with LSM). + +ktap_print_msg "With -i, executable scripts are allowed to be interpreted and executed." +ktap_test exec_direct 0 script-exec.inc ./set-exec -i -- +ktap_test exec_indirect 0 script-exec.inc ./set-exec -i -- + +ktap_print_msg "With -i, executable stdin is allowed to be interpreted." +ktap_test exec_stdin_reg 0 script-exec.inc ./set-exec -i -- + +ktap_print_msg "With -i, non-executable scripts are allowed to be interpreted, but not directly executed." +# Direct execution of non-executable script is alwayse denied by the kernel. +ktap_test exec_direct 1 script-noexec.inc ./set-exec -i -- +ktap_test exec_indirect 0 script-noexec.inc ./set-exec -i -- + +ktap_print_msg "With -i, non-executable stdin is not allowed to be interpreted." +ktap_test exec_stdin_reg 1 script-noexec.inc ./set-exec -i -- + +ktap_print_msg "With -i, interactive commands are not allowed to be interpreted." +ktap_test exec_interactive 1 ./set-exec -i -- + +# With both file restriction and denied interactive commands: only allow executable scripts. + +ktap_print_msg "With -fi, executable scripts are allowed to be interpreted and executed." +ktap_test exec_direct 0 script-exec.inc ./set-exec -fi -- +ktap_test exec_indirect 0 script-exec.inc ./set-exec -fi -- + +ktap_print_msg "With -fi, executable stdin is allowed to be interpreted." +ktap_test exec_stdin_reg 0 script-exec.inc ./set-exec -fi -- + +ktap_print_msg "With -fi, non-executable scripts are not allowed to be interpreted nor executed." +# Direct execution of non-executable script is alwayse denied by the kernel. +ktap_test exec_direct 1 script-noexec.inc ./set-exec -fi -- +ktap_test exec_indirect 1 script-noexec.inc ./set-exec -fi -- + +ktap_print_msg "With -fi, non-executable stdin is not allowed to be interpreted." +ktap_test exec_stdin_reg 1 script-noexec.inc ./set-exec -fi -- + +ktap_print_msg "With -fi, interactive commands are not allowed to be interpreted." +ktap_test exec_interactive 1 ./set-exec -fi -- + +ktap_finished -- cgit v1.2.3 From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001 From: "Isaac J. 
Manjarres" Date: Thu, 5 Dec 2024 11:29:41 -0800 Subject: selftests/memfd: run sysctl tests when PID namespace support is enabled The sysctl tests for vm.memfd_noexec rely on the kernel to support PID namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the kernel the test runs on does not support PID namespaces, the first sysctl test will fail when attempting to spawn a new thread in a new PID namespace, abort the test, preventing the remaining tests from being run. This is not desirable, as not all kernels need PID namespaces, but can still use the other features provided by memfd. Therefore, only run the sysctl tests if the kernel supports PID namespaces. Otherwise, skip those tests and emit an informative message to let the user know why the sysctl tests are not being run. Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") Signed-off-by: Isaac J. Manjarres Reviewed-by: Jeff Xu Cc: Suren Baghdasaryan Cc: Kalesh Singh Cc: [6.6+] Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 95af2d78fd31..0a0b55516028 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner, char *b_suffix) close(fd); } +static bool pid_ns_supported(void) +{ + return access("/proc/self/ns/pid", F_OK) == 0; +} + int main(int argc, char **argv) { pid_t pid; @@ -1591,8 +1597,12 @@ int main(int argc, char **argv) test_seal_grow(); test_seal_resize(); - test_sysctl_simple(); - test_sysctl_nested(); + if (pid_ns_supported()) { + test_sysctl_simple(); + test_sysctl_nested(); + } else { + printf("PID namespaces are not supported; skipping sysctl tests\n"); + } test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); -- cgit v1.2.3 From a17975992cc11588767175247ccaae1213a8b582 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 22:16:51 +0100 Subject: selftests: openvswitch: fix tcpdump execution Fix the way tcpdump is executed by: - Using the right variable for the namespace. Currently the use of the empty "ns" makes the command fail. - Waiting until it starts to capture to ensure the interesting traffic is caught on slow systems. - Using line-buffered output to ensure logs are available when the test is paused with "-p". Otherwise the last chunk of data might only be written when tcpdump is killed. 
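A minimal sketch of the resulting spawn-and-wait pattern, reusing the ovs_netns_spawn_daemon and ovs_wait helpers that openvswitch.sh already provides (the sandbox, namespace and log-directory variables are placeholders):

  # start tcpdump line-buffered inside the target namespace, then only
  # proceed once its stderr confirms it is actually capturing
  ovs_netns_spawn_daemon "$sbx" "$ns" tcpdump -l -i any
  ovs_wait grep -q "listening on any" "${ovs_dir}/stderr"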
Fixes: 74cc26f416b9 ("selftests: openvswitch: add interface support") Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20241217211652.483016-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index cc0bfae2bafa..960e1ab4dd04 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -171,8 +171,10 @@ ovs_add_netns_and_veths () { ovs_add_if "$1" "$2" "$4" -u || return 1 fi - [ $TRACING -eq 1 ] && ovs_netns_spawn_daemon "$1" "$ns" \ - tcpdump -i any -s 65535 + if [ $TRACING -eq 1 ]; then + ovs_netns_spawn_daemon "$1" "$3" tcpdump -l -i any -s 6553 + ovs_wait grep -q "listening on any" ${ovs_dir}/stderr + fi return 0 } -- cgit v1.2.3 From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Dec 2024 19:15:06 +0800 Subject: selftests/bpf: Use asm constraint "m" for LoongArch Currently, LoongArch LLVM does not support the constraint "o" and no plan to support it, it only supports the similar constraint "m", so change the constraints from "nor" in the "else" case to arch-specific "nmr" to avoid the build error such as "unexpected asm memory constraint" for LoongArch. Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests") Suggested-by: Weining Lu Suggested-by: Li Chen Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Reviewed-by: Huacai Chen Cc: stable@vger.kernel.org Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172 Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn --- tools/testing/selftests/bpf/sdt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index ca0162b4dc57..1fcfa5160231 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -102,6 +102,8 @@ # define STAP_SDT_ARG_CONSTRAINT nZr # elif defined __arm__ # define STAP_SDT_ARG_CONSTRAINT g +# elif defined __loongarch__ +# define STAP_SDT_ARG_CONSTRAINT nmr # else # define STAP_SDT_ARG_CONSTRAINT nor # endif -- cgit v1.2.3 From 716f2bca1ce93bb95364f1fc0555c1650507b588 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 18 Dec 2024 18:57:24 +0100 Subject: selftests/bpf: Fix compilation error in get_uprobe_offset() In get_uprobe_offset(), the call to procmap_query() use the constant PROCMAP_QUERY_VMA_EXECUTABLE, even if PROCMAP_QUERY is not defined. Define PROCMAP_QUERY_VMA_EXECUTABLE when PROCMAP_QUERY isn't. 
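A minimal sketch of the failure mode and the fix (caller simplified; the fallback value mirrors the hunk below):

  /* get_uprobe_offset() always references the flag, even when the
   * PROCMAP_QUERY ioctl is unavailable and only the stub is built */
  err = procmap_query(fd, addr, PROCMAP_QUERY_VMA_EXECUTABLE,
                      &start, &offset, &flags);

  /* so provide the constant when the UAPI headers do not */
  #ifndef PROCMAP_QUERY_VMA_EXECUTABLE
  # define PROCMAP_QUERY_VMA_EXECUTABLE 0x04
  #endif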
Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Jerome Marchand Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20241218175724.578884-1-jmarchan@redhat.com --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 2d742fdac6b9..81943c6254e6 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -293,6 +293,10 @@ static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *st return 0; } #else +# ifndef PROCMAP_QUERY_VMA_EXECUTABLE +# define PROCMAP_QUERY_VMA_EXECUTABLE 0x04 +# endif + static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; -- cgit v1.2.3 From 5760711e198d86bd0d0b9270a54a494ae9a501e0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:12:01 +0200 Subject: selftests: fib_rule_tests: Add flow label selector match tests Add tests for the new FIB rule flow label selector. Test both good and bad flows and with both input and output routes. # ./fib_rule_tests.sh IPv6 FIB rule tests [...] TEST: rule6 check: flowlabel redirect to table [ OK ] TEST: rule6 check: flowlabel no redirect to table [ OK ] TEST: rule6 del by pref: flowlabel redirect to table [ OK ] TEST: rule6 check: iif flowlabel redirect to table [ OK ] TEST: rule6 check: iif flowlabel no redirect to table [ OK ] TEST: rule6 del by pref: iif flowlabel redirect to table [ OK ] TEST: rule6 check: flowlabel masked redirect to table [ OK ] TEST: rule6 check: flowlabel masked no redirect to table [ OK ] TEST: rule6 del by pref: flowlabel masked redirect to table [ OK ] TEST: rule6 check: iif flowlabel masked redirect to table [ OK ] TEST: rule6 check: iif flowlabel masked no redirect to table [ OK ] TEST: rule6 del by pref: iif flowlabel masked redirect to table [ OK ] [...] Tests passed: 268 Tests failed: 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_rule_tests.sh | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 1d58b3b87465..847936363a12 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -291,6 +291,37 @@ fib_rule6_test() "$getnomatch" "iif dscp redirect to table" \ "iif dscp no redirect to table" fi + + fib_check_iproute_support "flowlabel" "flowlabel" + if [ $? 
-eq 0 ]; then + match="flowlabel 0xfffff" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel redirect to table" \ + "flowlabel no redirect to table" + + match="flowlabel 0xfffff" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel redirect to table" \ + "iif flowlabel no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel masked redirect to table" \ + "flowlabel masked no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel masked redirect to table" \ + "iif flowlabel masked no redirect to table" + fi } fib_rule6_vrf_test() -- cgit v1.2.3 From 6724bc65e59b57e64f65269da8956f8bdc12bb03 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Wed, 18 Dec 2024 09:00:18 -0500 Subject: selftests: net: remove redundant ncdevmem print Remove extrenous fprintf Signed-off-by: Jamal Hadi Salim Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20241218140018.15607-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 8e502a1f8f9b..19a6969643f4 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -619,9 +619,6 @@ int do_server(struct memory_buffer *mem) fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", page_aligned_frags, non_page_aligned_frags); - fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", - page_aligned_frags, non_page_aligned_frags); - cleanup: free(tmp_mem); -- cgit v1.2.3 From 55853cb829dc707427c3519f6b8686682a204368 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 18 Dec 2024 10:59:31 +0800 Subject: selftests/alsa: Fix circular dependency involving global-timer The pattern rule `$(OUTPUT)/%: %.c` inadvertently included a circular dependency on the global-timer target due to its inclusion in $(TEST_GEN_PROGS_EXTENDED). This resulted in a circular dependency warning during the build process. To resolve this, the dependency on $(TEST_GEN_PROGS_EXTENDED) has been replaced with an explicit dependency on $(OUTPUT)/libatest.so. This change ensures that libatest.so is built before any other targets that require it, without creating a circular dependency. This fix addresses the following warning: make[4]: Entering directory 'tools/testing/selftests/alsa' make[4]: Circular default_modconfig/kselftest/alsa/global-timer <- default_modconfig/kselftest/alsa/global-timer dependency dropped. make[4]: Nothing to be done for 'all'. 
make[4]: Leaving directory 'tools/testing/selftests/alsa' Cc: Mark Brown Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Shuah Khan Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241218025931.914164-1-lizhijian@fujitsu.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index 944279160fed..8dab90ad22bb 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -27,5 +27,5 @@ include ../lib.mk $(OUTPUT)/libatest.so: conf.c alsa-local.h $(CC) $(CFLAGS) -shared -fPIC $< $(LDLIBS) -o $@ -$(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) alsa-local.h +$(OUTPUT)/%: %.c $(OUTPUT)/libatest.so alsa-local.h $(CC) $(CFLAGS) $< $(LDLIBS) -latest -o $@ -- cgit v1.2.3 From dec2f97a1571ed28ddbadf4431afc5e5872a10df Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:50 -0600 Subject: cpupower: Remove spurious return statement print_duration() has a return; statement at the end of the function that is not necessary as it's a void function. Link: https://lore.kernel.org/r/20241218191144.3440854-2-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index c96b77365c63..5f092f3c729e 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -120,7 +120,6 @@ static void print_duration(unsigned long duration) } else printf("%lu ns", duration); } - return; } static int get_boost_mode_x86(unsigned int cpu) -- cgit v1.2.3 From 3f2eb7606eee37aea630c4b7aa42497bc36ca157 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:51 -0600 Subject: cpupower: Add support for parsing 'enabled' or 'disabled' strings from table When cpufreq_get_sysfs_value_from_table() is passed a table with kernel strings that report 'enabled' or 'disabled' it always returns 0 because these can't cleanly convert to integers. Explicitly look for enabled or disabled strings from the kernel to handle this. Link: https://lore.kernel.org/r/20241218191144.3440854-3-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpufreq.c b/tools/power/cpupower/lib/cpufreq.c index 1516d23c17c9..f27ee6d4b000 100644 --- a/tools/power/cpupower/lib/cpufreq.c +++ b/tools/power/cpupower/lib/cpufreq.c @@ -102,6 +102,10 @@ unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, if (len == 0) return 0; + if (!strcmp(linebuf, "enabled\n")) + return 1; + if (!strcmp(linebuf, "disabled\n")) + return 0; value = strtoul(linebuf, &endp, 0); if (endp == linebuf || errno == ERANGE) -- cgit v1.2.3 From 6d4a2987f96b9f281b07286eeb1d4022054e1ecd Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:52 -0600 Subject: cpupower: Add support for amd-pstate preferred core rankings The rankings are useful information to determine if the scheduler is placing tasks appropriately for the hardware. 
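For reference, the same per-CPU values can be read directly from sysfs; a usage sketch (paths assume an amd-pstate system):

  $ cat /sys/devices/system/cpu/cpu0/cpufreq/amd_pstate_hw_prefcore
  $ cat /sys/devices/system/cpu/cpu0/cpufreq/amd_pstate_prefcore_ranking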
Link: https://lore.kernel.org/r/20241218191144.3440854-4-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 0a56e22240fc..090fdd6d1551 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -177,6 +177,8 @@ enum amd_pstate_value { AMD_PSTATE_HIGHEST_PERF, AMD_PSTATE_MAX_FREQ, AMD_PSTATE_LOWEST_NONLINEAR_FREQ, + AMD_PSTATE_HW_PREFCORE, + AMD_PSTATE_PREFCORE_RANKING, MAX_AMD_PSTATE_VALUE_READ_FILES, }; @@ -184,6 +186,8 @@ static const char *amd_pstate_value_files[MAX_AMD_PSTATE_VALUE_READ_FILES] = { [AMD_PSTATE_HIGHEST_PERF] = "amd_pstate_highest_perf", [AMD_PSTATE_MAX_FREQ] = "amd_pstate_max_freq", [AMD_PSTATE_LOWEST_NONLINEAR_FREQ] = "amd_pstate_lowest_nonlinear_freq", + [AMD_PSTATE_HW_PREFCORE] = "amd_pstate_hw_prefcore", + [AMD_PSTATE_PREFCORE_RANKING] = "amd_pstate_prefcore_ranking", }; static unsigned long amd_pstate_get_data(unsigned int cpu, @@ -240,6 +244,10 @@ void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) acpi_cppc_get_data(cpu, LOWEST_PERF)); print_speed(acpi_cppc_get_data(cpu, LOWEST_FREQ) * 1000, no_rounding); printf(".\n"); + + printf(_(" Preferred Core Support: %lu. Preferred Core Ranking: %lu.\n"), + amd_pstate_get_data(cpu, AMD_PSTATE_HW_PREFCORE), + amd_pstate_get_data(cpu, AMD_PSTATE_PREFCORE_RANKING)); } /* AMD P-State Helper Functions ************************************/ -- cgit v1.2.3 From 26e16174f54d40a3774614c4d43966572ed79dc1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:53 -0600 Subject: cpupower: Don't try to read frequency from hardware when kernel uses aperfmperf When the amd-pstate is in use frequency is set by the hardware and measured by the kernel through using the aperf and mperf registers. There is no direct call to the hardware to indicate current frequency. Detect that this feature is in use and skip the check. Link: https://lore.kernel.org/r/20241218191144.3440854-5-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 5f092f3c729e..3df28e45be42 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -254,7 +254,12 @@ static int get_freq_kernel(unsigned int cpu, unsigned int human) static int get_freq_hardware(unsigned int cpu, unsigned int human) { - unsigned long freq = cpufreq_get_freq_hardware(cpu); + unsigned long freq; + + if (cpupower_cpu_info.caps & CPUPOWER_CAP_APERF) + return -EINVAL; + + freq = cpufreq_get_freq_hardware(cpu); printf(_(" current CPU frequency: ")); if (!freq) { printf("Unable to call hardware\n"); -- cgit v1.2.3 From 5f567afc283fc9e7c6a34d013c4fc6c5e8d6afae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:54 -0600 Subject: cpupower: Add support for showing energy performance preference The EPP value is useful for characterization of performance. Show it in cpupower frequency-info output. 
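A usage sketch of the new option and the sysfs attribute it reads (option name taken from the hunk below; the path assumes a cpufreq driver with EPP support):

  $ cpupower frequency-info --epp
  $ cat /sys/devices/system/cpu/cpu0/cpufreq/energy_performance_preference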
Link: https://lore.kernel.org/r/20241218191144.3440854-6-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpufreq.c | 14 ++++++++++++++ tools/power/cpupower/lib/cpufreq.h | 8 ++++++++ tools/power/cpupower/utils/cpufreq-info.c | 25 ++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpufreq.c b/tools/power/cpupower/lib/cpufreq.c index f27ee6d4b000..8dda3db2dff0 100644 --- a/tools/power/cpupower/lib/cpufreq.c +++ b/tools/power/cpupower/lib/cpufreq.c @@ -127,12 +127,14 @@ static unsigned long sysfs_cpufreq_get_one_value(unsigned int cpu, enum cpufreq_string { SCALING_DRIVER, SCALING_GOVERNOR, + ENERGY_PERFORMANCE_PREFERENCE, MAX_CPUFREQ_STRING_FILES }; static const char *cpufreq_string_files[MAX_CPUFREQ_STRING_FILES] = { [SCALING_DRIVER] = "scaling_driver", [SCALING_GOVERNOR] = "scaling_governor", + [ENERGY_PERFORMANCE_PREFERENCE] = "energy_performance_preference", }; @@ -207,6 +209,18 @@ unsigned long cpufreq_get_transition_latency(unsigned int cpu) return sysfs_cpufreq_get_one_value(cpu, CPUINFO_LATENCY); } +char *cpufreq_get_energy_performance_preference(unsigned int cpu) +{ + return sysfs_cpufreq_get_one_string(cpu, ENERGY_PERFORMANCE_PREFERENCE); +} + +void cpufreq_put_energy_performance_preference(char *ptr) +{ + if (!ptr) + return; + free(ptr); +} + int cpufreq_get_hardware_limits(unsigned int cpu, unsigned long *min, unsigned long *max) diff --git a/tools/power/cpupower/lib/cpufreq.h b/tools/power/cpupower/lib/cpufreq.h index 2f3c84035806..bfc617311ebd 100644 --- a/tools/power/cpupower/lib/cpufreq.h +++ b/tools/power/cpupower/lib/cpufreq.h @@ -68,6 +68,14 @@ unsigned long cpufreq_get_freq_hardware(unsigned int cpu); unsigned long cpufreq_get_transition_latency(unsigned int cpu); +/* determine energy performance preference + * + * returns NULL on failure, else the string that represents the energy performance + * preference requested. 
+ */ +char *cpufreq_get_energy_performance_preference(unsigned int cpu); +void cpufreq_put_energy_performance_preference(char *ptr); + /* determine hardware CPU frequency limits * * These may be limited further by thermal, energy or other diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 3df28e45be42..eb9cc0f10634 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -422,6 +422,23 @@ static int get_freq_stats(unsigned int cpu, unsigned int human) return 0; } +/* --epp / -z */ + +static int get_epp(unsigned int cpu, bool interactive) +{ + char *epp; + + epp = cpufreq_get_energy_performance_preference(cpu); + if (!epp) + return -EINVAL; + if (interactive) + printf(_(" energy performance preference: %s\n"), epp); + + cpufreq_put_energy_performance_preference(epp); + + return 0; +} + /* --latency / -y */ static int get_latency(unsigned int cpu, unsigned int human) @@ -461,6 +478,7 @@ static void debug_output_one(unsigned int cpu) get_related_cpus(cpu); get_affected_cpus(cpu); get_latency(cpu, 1); + get_epp(cpu, true); get_hardware_limits(cpu, 1); freqs = cpufreq_get_available_frequencies(cpu); @@ -501,6 +519,7 @@ static struct option info_opts[] = { {"human", no_argument, NULL, 'm'}, {"no-rounding", no_argument, NULL, 'n'}, {"performance", no_argument, NULL, 'c'}, + {"epp", no_argument, NULL, 'z'}, { }, }; @@ -514,7 +533,7 @@ int cmd_freq_info(int argc, char **argv) int output_param = 0; do { - ret = getopt_long(argc, argv, "oefwldpgrasmybnc", info_opts, + ret = getopt_long(argc, argv, "oefwldpgrasmybncz", info_opts, NULL); switch (ret) { case '?': @@ -538,6 +557,7 @@ int cmd_freq_info(int argc, char **argv) case 's': case 'y': case 'c': + case 'z': if (output_param) { output_param = -1; cont = 0; @@ -647,6 +667,9 @@ int cmd_freq_info(int argc, char **argv) case 'c': ret = get_perf_cap(cpu); break; + case 'z': + ret = get_epp(cpu, true); + break; } if (ret) return ret; -- cgit v1.2.3 From acf71265e4c0289e23ee1b66fc0977478edea9a5 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:55 -0600 Subject: cpupower: Don't fetch maximum latency when EPP is enabled When EPP has been enabled the hardware will autonomously change frequencies on it's own and thus there is no latency with changing from the kernel. Avoid doing the maximum latency check when EPP is found. This will apply to both amd-pstate and intel-pstate drivers. 
Link: https://lore.kernel.org/r/20241218191144.3440854-7-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index eb9cc0f10634..fc750e127404 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -445,6 +445,9 @@ static int get_latency(unsigned int cpu, unsigned int human) { unsigned long latency = cpufreq_get_transition_latency(cpu); + if (!get_epp(cpu, false)) + return -EINVAL; + printf(_(" maximum transition latency: ")); if (!latency || latency == UINT_MAX) { printf(_(" Cannot determine or is not supported.\n")); -- cgit v1.2.3 From 8395d43949790f1671621975730a07c264ef3e6f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Dec 2024 13:09:56 -0600 Subject: cpupower: Adjust whitespace for amd-pstate specific prints The amd-pstate section is grouped under boost, which isn't appropriate. Adjust the indentation so that it is it's own section. Link: https://lore.kernel.org/r/20241218191144.3440854-8-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 090fdd6d1551..795562e879de 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -219,7 +219,9 @@ void amd_pstate_boost_init(unsigned int cpu, int *support, int *active) void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) { - printf(_(" AMD PSTATE Highest Performance: %lu. Maximum Frequency: "), + + printf(_(" amd-pstate limits:\n")); + printf(_(" Highest Performance: %lu. Maximum Frequency: "), amd_pstate_get_data(cpu, AMD_PSTATE_HIGHEST_PERF)); /* * If boost isn't active, the cpuinfo_max doesn't indicate real max @@ -228,19 +230,19 @@ void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_MAX_FREQ), no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Nominal Performance: %lu. Nominal Frequency: "), + printf(_(" Nominal Performance: %lu. Nominal Frequency: "), acpi_cppc_get_data(cpu, NOMINAL_PERF)); print_speed(acpi_cppc_get_data(cpu, NOMINAL_FREQ) * 1000, no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Lowest Non-linear Performance: %lu. Lowest Non-linear Frequency: "), + printf(_(" Lowest Non-linear Performance: %lu. Lowest Non-linear Frequency: "), acpi_cppc_get_data(cpu, LOWEST_NONLINEAR_PERF)); print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_LOWEST_NONLINEAR_FREQ), no_rounding); printf(".\n"); - printf(_(" AMD PSTATE Lowest Performance: %lu. Lowest Frequency: "), + printf(_(" Lowest Performance: %lu. Lowest Frequency: "), acpi_cppc_get_data(cpu, LOWEST_PERF)); print_speed(acpi_cppc_get_data(cpu, LOWEST_FREQ) * 1000, no_rounding); printf(".\n"); -- cgit v1.2.3 From 6de02569a2bb678db04236fdf29814c0c27f5121 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 18 Dec 2024 20:26:02 -0500 Subject: pm: cpupower: Add install and uninstall options to bindings makefile Installs the .so and .py files generated by SWIG to system's site packages directory. This allows the Python bindings to be used system wide. 
This commit also includes documentation on setting up and installing the Python bindings. Link: https://lore.kernel.org/r/20241219012606.38963-1-jwyatt@redhat.com Signed-off-by: "John B. Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 10 ++++++++++ tools/power/cpupower/bindings/python/README | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index e1ebb1d60cd4..741f21477432 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -11,6 +11,7 @@ HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; el LIB_DIR := ../../lib PY_INCLUDE = $(firstword $(shell python-config --includes)) OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) +INSTALL_DIR = $(shell python3 -c "import site; print(site.getsitepackages()[0])") all: _raw_pylibcpupower.so @@ -28,6 +29,15 @@ else ifeq ($(HAVE_PYCONFIG),0) endif swig -python raw_pylibcpupower.swg +# Only installs the Python bindings +install: _raw_pylibcpupower.so + install -D _raw_pylibcpupower.so $(INSTALL_DIR)/_raw_pylibcpupower.so + install -D raw_pylibcpupower.py $(INSTALL_DIR)/raw_pylibcpupower.py + +uninstall: + rm -f $(INSTALL_DIR)/_raw_pylibcpupower.so + rm -f $(INSTALL_DIR)/raw_pylibcpupower.py + # Will only clean the bindings folder; will not clean the actual cpupower folder clean: rm -f raw_pylibcpupower.py raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.o _raw_pylibcpupower.so diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README index 0a4bb2581e8a..952e2e02fd32 100644 --- a/tools/power/cpupower/bindings/python/README +++ b/tools/power/cpupower/bindings/python/README @@ -48,6 +48,31 @@ To run the test script: $ python test_raw_pylibcpupower.py +developing/using the bindings directly +-------------------------------------- + +You need to add the Python bindings directory to your $PYTHONPATH. + +You would set the path in the Bash terminal or in the Bash profile: + +PYTHONPATH=~/linux/tools/power/cpupower/bindings/python:$PYTHONPATH + +This allows you to set a specific repo of the bindings to use. + + +installing/uninstalling +----------------------- + +Python uses a system specific site-packages folder to look up modules to import +by default. You do not need to install cpupower to use the SWIG bindings. 
+ +You can install and uninstall the bindings to the site-packages with: + +sudo make install + +sudo make uninstall + + credits ------- -- cgit v1.2.3 From c5d2bac978c513e1f22273cba9c55db3778032e5 Mon Sep 17 00:00:00 2001 From: Ariel Otilibili Date: Wed, 11 Dec 2024 22:57:29 +0100 Subject: selftests/bpf: Clear out Python syntax warnings Invalid escape sequences are used, and produced syntax warnings: $ test_bpftool_synctypes.py test_bpftool_synctypes.py:69: SyntaxWarning: invalid escape sequence '\[' self.start_marker = re.compile(f'(static )?const bool {self.array_name}\[.*\] = {{\n') test_bpftool_synctypes.py:83: SyntaxWarning: invalid escape sequence '\[' pattern = re.compile('\[(BPF_\w*)\]\s*= (true|false),?$') test_bpftool_synctypes.py:181: SyntaxWarning: invalid escape sequence '\s' pattern = re.compile('^\s*(BPF_\w+),?(\s+/\*.*\*/)?$') test_bpftool_synctypes.py:229: SyntaxWarning: invalid escape sequence '\*' start_marker = re.compile(f'\*{block_name}\* := {{') test_bpftool_synctypes.py:229: SyntaxWarning: invalid escape sequence '\*' start_marker = re.compile(f'\*{block_name}\* := {{') test_bpftool_synctypes.py:230: SyntaxWarning: invalid escape sequence '\*' pattern = re.compile('\*\*([\w/-]+)\*\*') test_bpftool_synctypes.py:248: SyntaxWarning: invalid escape sequence '\s' start_marker = re.compile(f'"\s*{block_name} := {{') test_bpftool_synctypes.py:249: SyntaxWarning: invalid escape sequence '\w' pattern = re.compile('([\w/]+) [|}]') test_bpftool_synctypes.py:267: SyntaxWarning: invalid escape sequence '\s' start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') test_bpftool_synctypes.py:267: SyntaxWarning: invalid escape sequence '\s' start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') test_bpftool_synctypes.py:268: SyntaxWarning: invalid escape sequence '\w' pattern = re.compile('([\w-]+) ?(?:\||}[ }\]])') test_bpftool_synctypes.py:287: SyntaxWarning: invalid escape sequence '\w' pattern = re.compile('(?:.*=\')?([\w/]+)') test_bpftool_synctypes.py:319: SyntaxWarning: invalid escape sequence '\w' pattern = re.compile('([\w-]+) ?(?:\||}[ }\]"])') test_bpftool_synctypes.py:341: SyntaxWarning: invalid escape sequence '\|' start_marker = re.compile('\|COMMON_OPTIONS\| replace:: {') test_bpftool_synctypes.py:342: SyntaxWarning: invalid escape sequence '\*' pattern = re.compile('\*\*([\w/-]+)\*\*') Escaping them clears out the warnings. $ tools/testing/selftests/bpf/test_bpftool_synctypes.py; echo $? 
0 Signed-off-by: Ariel Otilibili Signed-off-by: Daniel Borkmann Tested-by: Quentin Monnet Reviewed-by: Quentin Monnet Link: https://docs.python.org/3/library/re.html Link: https://lore.kernel.org/bpf/20241211220012.714055-2-ariel.otilibili-anieli@eurecom.fr --- .../selftests/bpf/test_bpftool_synctypes.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 0ed67b6b31dd..238121fda5b6 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -66,7 +66,7 @@ class ArrayParser(BlockParser): def __init__(self, reader, array_name): self.array_name = array_name - self.start_marker = re.compile(f'(static )?const bool {self.array_name}\[.*\] = {{\n') + self.start_marker = re.compile(fr'(static )?const bool {self.array_name}\[.*\] = {{\n') super().__init__(reader) def search_block(self): @@ -80,7 +80,7 @@ class ArrayParser(BlockParser): Parse a block and return data as a dictionary. Items to extract must be on separate lines in the file. """ - pattern = re.compile('\[(BPF_\w*)\]\s*= (true|false),?$') + pattern = re.compile(r'\[(BPF_\w*)\]\s*= (true|false),?$') entries = set() while True: line = self.reader.readline() @@ -178,7 +178,7 @@ class FileExtractor(object): @enum_name: name of the enum to parse """ start_marker = re.compile(f'enum {enum_name} {{\n') - pattern = re.compile('^\s*(BPF_\w+),?(\s+/\*.*\*/)?$') + pattern = re.compile(r'^\s*(BPF_\w+),?(\s+/\*.*\*/)?$') end_marker = re.compile('^};') parser = BlockParser(self.reader) parser.search_block(start_marker) @@ -226,8 +226,8 @@ class FileExtractor(object): @block_name: name of the blog to parse, 'TYPE' in the example """ - start_marker = re.compile(f'\*{block_name}\* := {{') - pattern = re.compile('\*\*([\w/-]+)\*\*') + start_marker = re.compile(fr'\*{block_name}\* := {{') + pattern = re.compile(r'\*\*([\w/-]+)\*\*') end_marker = re.compile('}\n') return self.__get_description_list(start_marker, pattern, end_marker) @@ -245,8 +245,8 @@ class FileExtractor(object): @block_name: name of the blog to parse, 'TYPE' in the example """ - start_marker = re.compile(f'"\s*{block_name} := {{') - pattern = re.compile('([\w/]+) [|}]') + start_marker = re.compile(fr'"\s*{block_name} := {{') + pattern = re.compile(r'([\w/]+) [|}]') end_marker = re.compile('}') return self.__get_description_list(start_marker, pattern, end_marker) @@ -264,8 +264,8 @@ class FileExtractor(object): @macro: macro starting the block, 'HELP_SPEC_OPTIONS' in the example """ - start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') - pattern = re.compile('([\w-]+) ?(?:\||}[ }\]])') + start_marker = re.compile(fr'"\s*{macro}\s*" [|}}]') + pattern = re.compile(r'([\w-]+) ?(?:\||}[ }\]])') end_marker = re.compile('}\\\\n') return self.__get_description_list(start_marker, pattern, end_marker) @@ -283,8 +283,8 @@ class FileExtractor(object): @block_name: name of the blog to parse, 'TYPE' in the example """ - start_marker = re.compile(f'local {block_name}=\'') - pattern = re.compile('(?:.*=\')?([\w/]+)') + start_marker = re.compile(fr'local {block_name}=\'') + pattern = re.compile(r'(?:.*=\')?([\w/]+)') end_marker = re.compile('\'$') return self.__get_description_list(start_marker, pattern, end_marker) @@ -316,7 +316,7 @@ class MainHeaderFileExtractor(SourceFileExtractor): {'-p', '-d', '--pretty', '--debug', '--json', '-j'} """ start_marker = 
re.compile(f'"OPTIONS :=') - pattern = re.compile('([\w-]+) ?(?:\||}[ }\]"])') + pattern = re.compile(r'([\w-]+) ?(?:\||}[ }\]"])') end_marker = re.compile('#define') parser = InlineListParser(self.reader) @@ -338,8 +338,8 @@ class ManSubstitutionsExtractor(SourceFileExtractor): {'-p', '-d', '--pretty', '--debug', '--json', '-j'} """ - start_marker = re.compile('\|COMMON_OPTIONS\| replace:: {') - pattern = re.compile('\*\*([\w/-]+)\*\*') + start_marker = re.compile(r'\|COMMON_OPTIONS\| replace:: {') + pattern = re.compile(r'\*\*([\w/-]+)\*\*') end_marker = re.compile('}$') parser = InlineListParser(self.reader) -- cgit v1.2.3 From 724c6ce38bbaeb4b3f109b0e066d6c0ecd15446c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 19 Dec 2024 14:57:34 +0100 Subject: stddef: make __struct_group() UAPI C++-friendly For the most part of the C++ history, it couldn't have type declarations inside anonymous unions for different reasons. At the same time, __struct_group() relies on the latters, so when the @TAG argument is not empty, C++ code doesn't want to build (even under `extern "C"`): ../linux/include/uapi/linux/pkt_cls.h:25:24: error: 'struct tc_u32_sel::::tc_u32_sel_hdr,' invalid; an anonymous union may only have public non-static data members [-fpermissive] The safest way to fix this without trying to switch standards (which is impossible in UAPI anyway) etc., is to disable tag declaration for that language. This won't break anything since for now it's not buildable at all. Use a separate definition for __struct_group() when __cplusplus is defined to mitigate the error, including the version from tools/. Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Reported-by: Christopher Ferris Closes: https://lore.kernel.org/linux-hardening/Z1HZpe3WE5As8UAz@google.com Suggested-by: Kees Cook # __struct_group_tag() Signed-off-by: Alexander Lobakin Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20241219135734.2130002-1-aleksander.lobakin@intel.com Signed-off-by: Kees Cook --- tools/include/uapi/linux/stddef.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index bb6ea517efb5..c53cde425406 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline __inline__ #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,14 +27,14 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) 
\ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ - } + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ + } ATTRS /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union -- cgit v1.2.3 From 246068b86b1c36e4590388ab8f278e21f1997dc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Dec 2024 17:54:10 +0200 Subject: selftests: net: local_termination: require mausezahn Since the blamed commit, we require mausezahn because send_raw() uses it. Remove the "REQUIRE_MZ=no" line, which overwrites the default of requiring it. Fixes: 237979504264 ("selftests: net: local_termination: add PTP frames to the mix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241219155410.1856868-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c35548767756..ecd34f364125 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -7,7 +7,6 @@ ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes -REQUIRE_MZ=no source lib.sh -- cgit v1.2.3 From 30b981796b94b083da8fdded7cb74cb493608760 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:33 -0800 Subject: selftests: drv-net: test empty queue and NAPI responses in netlink Make sure kernel doesn't respond to GETs for queues and NAPIs when link is down. Not with valid data, or with empty message, we want a ENOENT. Link: https://patch.msgid.link/20241219032833.1165433-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 28 +++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 9c5473abbd78..38303da957ee 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import ksft_disruptive, ksft_exit, ksft_run +from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd +from lib.py import cmd, defer, ip +import errno import glob @@ -59,9 +61,27 @@ def addremove_queues(cfg, nl) -> None: ksft_eq(queues, expected) +@ksft_disruptive +def check_down(cfg, nl) -> None: + # Check the NAPI IDs before interface goes down and hides them + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + with ksft_raises(NlError) as cm: + nl.queue_get({'ifindex': cfg.ifindex, 'id': 0, 'type': 'rx'}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + if napis: + with ksft_raises(NlError) as cm: + nl.napi_get({'id': napis[0]['id']}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + 
ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) ksft_exit() -- cgit v1.2.3 From 976d248bd33356eecb958cdc1b0c37622fd5d595 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:58 +0100 Subject: selftests: net: lib: Add a couple autodefer helpers Alongside the helper ip_link_set_up(), one to set the link down will be useful as well. Add a helper to determine the link state as well, ip_link_is_up(), and use it to short-circuit any changes if the state is already the desired one. Furthermore, add a helper bridge_vlan_add(). Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/856d9e01725fdba21b7f6716358f645b19131af2.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 2cd5c743b2d9..0bd9a038a1f0 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -477,12 +477,33 @@ ip_link_set_addr() defer ip link set dev "$name" address "$old_addr" } +ip_link_is_up() +{ + local name=$1; shift + + local state=$(ip -j link show "$name" | + jq -r '(.[].flags[] | select(. == "UP")) // "DOWN"') + [[ $state == "UP" ]] +} + ip_link_set_up() { local name=$1; shift - ip link set dev "$name" up - defer ip link set dev "$name" down + if ! ip_link_is_up "$name"; then + ip link set dev "$name" up + defer ip link set dev "$name" down + fi +} + +ip_link_set_down() +{ + local name=$1; shift + + if ip_link_is_up "$name"; then + ip link set dev "$name" down + defer ip link set dev "$name" up + fi } ip_addr_add() @@ -498,3 +519,9 @@ ip_route_add() ip route add "$@" defer ip route del "$@" } + +bridge_vlan_add() +{ + bridge vlan add "$@" + defer bridge vlan del "$@" +} -- cgit v1.2.3 From dca12e9ab7603d94e47ded65080f750d6527c852 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:59 +0100 Subject: selftests: net: Add a VLAN bridge binding selftest Add a test that exercises bridge binding. 
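The test builds on the autodefer helpers introduced in the previous patch; a minimal sketch of the setup pattern it relies on (mirroring setup_prepare() below):

  # each helper registers its own cleanup via defer, so the test needs no
  # explicit teardown path
  ip_link_add br up type bridge vlan_filtering 1
  ip_link_set_up d1
  bridge_vlan_add vid 11 dev d1 master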
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/baf7244fd1fe223a6d93e027584fa9f99dee982c.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/vlan_bridge_binding.sh | 256 +++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100755 tools/testing/selftests/net/vlan_bridge_binding.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index f09bd96cc978..73ee88d6b043 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,6 +96,7 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh fdb_notify.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh new file mode 100755 index 000000000000..e7cb8c678bde --- /dev/null +++ b/tools/testing/selftests/net/vlan_bridge_binding.sh @@ -0,0 +1,256 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ALL_TESTS=" + test_binding_on + test_binding_off + test_binding_toggle_on + test_binding_toggle_off + test_binding_toggle_on_when_upper_down + test_binding_toggle_off_when_upper_down + test_binding_toggle_on_when_lower_down + test_binding_toggle_off_when_lower_down +" + +setup_prepare() +{ + local port + + ip_link_add br up type bridge vlan_filtering 1 + + for port in d1 d2 d3; do + ip_link_add $port type veth peer name r$port + ip_link_set_up $port + ip_link_set_up r$port + ip_link_set_master $port br + done + + bridge_vlan_add vid 11 dev br self + bridge_vlan_add vid 11 dev d1 master + + bridge_vlan_add vid 12 dev br self + bridge_vlan_add vid 12 dev d2 master + + bridge_vlan_add vid 13 dev br self + bridge_vlan_add vid 13 dev d1 master + bridge_vlan_add vid 13 dev d2 master + + bridge_vlan_add vid 14 dev br self + bridge_vlan_add vid 14 dev d1 master + bridge_vlan_add vid 14 dev d2 master + bridge_vlan_add vid 14 dev d3 master +} + +operstate_is() +{ + local dev=$1; shift + local expect=$1; shift + + local operstate=$(ip -j link show $dev | jq -r .[].operstate) + if [[ $operstate == UP ]]; then + operstate=1 + elif [[ $operstate == DOWN || $operstate == LOWERLAYERDOWN ]]; then + operstate=0 + fi + echo -n $operstate + [[ $operstate == $expect ]] +} + +check_operstate() +{ + local dev=$1; shift + local expect=$1; shift + local operstate + + operstate=$(busywait 1000 \ + operstate_is "$dev" "$expect") + check_err $? 
"Got operstate of $operstate, expected $expect" +} + +add_one_vlan() +{ + local link=$1; shift + local id=$1; shift + + ip_link_add $link.$id link $link type vlan id $id "$@" +} + +add_vlans() +{ + add_one_vlan br 11 "$@" + add_one_vlan br 12 "$@" + add_one_vlan br 13 "$@" + add_one_vlan br 14 "$@" +} + +set_vlans() +{ + ip link set dev br.11 "$@" + ip link set dev br.12 "$@" + ip link set dev br.13 "$@" + ip link set dev br.14 "$@" +} + +down_netdevs() +{ + local dev + + for dev in "$@"; do + ip_link_set_down $dev + done +} + +check_operstates() +{ + local opst_11=$1; shift + local opst_12=$1; shift + local opst_13=$1; shift + local opst_14=$1; shift + + check_operstate br.11 $opst_11 + check_operstate br.12 $opst_12 + check_operstate br.13 $opst_13 + check_operstate br.14 $opst_14 +} + +do_test_binding() +{ + local inject=$1; shift + local what=$1; shift + local opsts_d1=$1; shift + local opsts_d2=$1; shift + local opsts_d12=$1; shift + local opsts_d123=$1; shift + + RET=0 + + defer_scope_push + down_netdevs d1 + $inject + check_operstates $opsts_d1 + defer_scope_pop + + defer_scope_push + down_netdevs d2 + $inject + check_operstates $opsts_d2 + defer_scope_pop + + defer_scope_push + down_netdevs d1 d2 + $inject + check_operstates $opsts_d12 + defer_scope_pop + + defer_scope_push + down_netdevs d1 d2 d3 + $inject + check_operstates $opsts_d123 + defer_scope_pop + + log_test "Test bridge_binding $what" +} + +do_test_binding_on() +{ + local inject=$1; shift + local what=$1; shift + + do_test_binding "$inject" "$what" \ + "0 1 1 1" \ + "1 0 1 1" \ + "0 0 0 1" \ + "0 0 0 0" +} + +do_test_binding_off() +{ + local inject=$1; shift + local what=$1; shift + + do_test_binding "$inject" "$what" \ + "1 1 1 1" \ + "1 1 1 1" \ + "1 1 1 1" \ + "0 0 0 0" +} + +test_binding_on() +{ + add_vlans bridge_binding on + set_vlans up + do_test_binding_on : "on" +} + +test_binding_off() +{ + add_vlans bridge_binding off + set_vlans up + do_test_binding_off : "off" +} + +test_binding_toggle_on() +{ + add_vlans bridge_binding off + set_vlans up + set_vlans type vlan bridge_binding on + do_test_binding_on : "off->on" +} + +test_binding_toggle_off() +{ + add_vlans bridge_binding on + set_vlans up + set_vlans type vlan bridge_binding off + do_test_binding_off : "on->off" +} + +dfr_set_binding_on() +{ + set_vlans type vlan bridge_binding on + defer set_vlans type vlan bridge_binding off +} + +dfr_set_binding_off() +{ + set_vlans type vlan bridge_binding off + defer set_vlans type vlan bridge_binding on +} + +test_binding_toggle_on_when_lower_down() +{ + add_vlans bridge_binding off + set_vlans up + do_test_binding_on dfr_set_binding_on "off->on when lower down" +} + +test_binding_toggle_off_when_lower_down() +{ + add_vlans bridge_binding on + set_vlans up + do_test_binding_off dfr_set_binding_off "on->off when lower down" +} + +test_binding_toggle_on_when_upper_down() +{ + add_vlans bridge_binding off + set_vlans type vlan bridge_binding on + set_vlans up + do_test_binding_on : "off->on when upper down" +} + +test_binding_toggle_off_when_upper_down() +{ + add_vlans bridge_binding on + set_vlans type vlan bridge_binding off + set_vlans up + do_test_binding_off : "on->off when upper down" +} + +trap defer_scopes_cleanup EXIT +setup_prepare +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 9ee0c7b8654346d60c823babe4b3747357a30477 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:55 -0800 Subject: selftests/bpf: Add a BPF selftest for bpf_skb_change_tail() As requested by Daniel, we need to add a 
selftest to cover bpf_skb_change_tail() cases in skb_verdict. Here we test trimming, growing and error cases, and validate its expected return values and the expected sizes of the payload. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-3-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 51 ++++++++++++++++++++++ .../selftests/bpf/progs/test_sockmap_change_tail.c | 40 +++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 248754296d97..884ad87783d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -12,6 +12,7 @@ #include "test_sockmap_progs_query.skel.h" #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" +#include "test_sockmap_change_tail.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -643,6 +644,54 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_change_tail(void) +{ + struct test_sockmap_change_tail *skel; + int err, map, verdict; + int c1, p1, sent, recvd; + int zero = 0; + char buf[2]; + + skel = test_sockmap_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) + goto out; + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) + goto out_close; + sent = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(sent, 2, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "G", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "E", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 1, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_change_tail__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1058,6 +1107,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) test_sockmap_skb_verdict_fionread(false); + if (test__start_subtest("sockmap skb_verdict change tail")) + test_sockmap_skb_verdict_change_tail(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c new file mode 100644 index 000000000000..2796dd8545eb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c @@ -0,0 +1,40 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 ByteDance */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map_rx SEC(".maps"); + +long change_tail_ret = 1; + +SEC("sk_skb") +int prog_skb_verdict(struct __sk_buff *skb) +{ + char *data, *data_end; + + bpf_skb_pull_data(skb, 1); + data = (char *)(unsigned long)skb->data; + data_end = (char *)(unsigned long)skb->data_end; + + if (data + 1 > data_end) + return SK_PASS; + + if (data[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len - 1, 0); + return SK_PASS; + } else if (data[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len + 1, 0); + return SK_PASS; + } else if (data[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return SK_PASS; + } + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 472759c9f5377912c7483cca5da847888a27cecc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:56 -0800 Subject: selftests/bpf: Introduce socket_helpers.h for TC tests Pull socket helpers out of sockmap_helpers.h so that they can be reused for TC tests as well. This prepares for the next patch. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-4-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/socket_helpers.h | 394 +++++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_helpers.h | 385 +------------------- 2 files changed, 395 insertions(+), 384 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/socket_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/socket_helpers.h b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h new file mode 100644 index 000000000000..1bdfb79ef009 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SOCKET_HELPERS__ +#define __SOCKET_HELPERS__ + +#include + +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 + +/* workaround for older vm_sockets.h */ +#ifndef VMADDR_CID_LOCAL +#define VMADDR_CID_LOCAL 1 +#endif + +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + +/* Wrappers that fail the test on error and report it. */ + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) 
_FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static inline void init_addr_loopback4(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static inline void init_addr_loopback6(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); + + addr->svm_family = AF_VSOCK; + addr->svm_port = VMADDR_PORT_ANY; + addr->svm_cid = VMADDR_CID_LOCAL; + *len = sizeof(*addr); +} + +static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + case AF_VSOCK: + init_addr_loopback_vsock(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline int enable_reuseport(int s, int progfd) +{ + 
int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static inline int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len = 0; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static inline int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static inline int poll_connect(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set wfds; + int r, eval; + socklen_t esize = sizeof(eval); + + FD_ZERO(&wfds); + FD_SET(fd, &wfds); + + r = select(fd + 1, NULL, &wfds, NULL, &timeout); + if (r == 0) + errno = ETIME; + if (r != 1) + return -1; + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) + return -1; + if (eval != 0) { + errno = eval; + return -1; + } + + return 0; +} + +static inline int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 0 : -1; +} + +static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static inline int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + + +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + __close_fd int s, c = -1, p = -1; + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + return err; + + c = xsocket(family, sotype, 0); + if (c < 0) + return c; + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + return err; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + return err; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + return err; + + err = xconnect(s, sockaddr(&addr), len); + if (err) + return err; + + *p0 = take_fd(s); + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + return p; + + *p0 = take_fd(p); + break; + default: + FAIL("Unsupported socket type %#x", sotype); + return -EOPNOTSUPP; + } + + *p1 = take_fd(c); + return 0; +} + +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} + +#endif // __SOCKET_HELPERS__ diff --git 
a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 38e35c72bdaa..3e5571dd578d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -1,139 +1,12 @@ #ifndef __SOCKMAP_HELPERS__ #define __SOCKMAP_HELPERS__ -#include +#include "socket_helpers.h" -/* include/linux/net.h */ -#define SOCK_TYPE_MASK 0xf - -#define IO_TIMEOUT_SEC 30 -#define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 -/* workaround for older vm_sockets.h */ -#ifndef VMADDR_CID_LOCAL -#define VMADDR_CID_LOCAL 1 -#endif - #define __always_unused __attribute__((__unused__)) -/* include/linux/cleanup.h */ -#define __get_and_null(p, nullvalue) \ - ({ \ - __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = nullvalue; \ - __val; \ - }) - -#define take_fd(fd) __get_and_null(fd, -EBADF) - -#define _FAIL(errnum, fmt...) \ - ({ \ - error_at_line(0, (errnum), __func__, __LINE__, fmt); \ - CHECK_FAIL(true); \ - }) -#define FAIL(fmt...) _FAIL(0, fmt) -#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) -#define FAIL_LIBBPF(err, msg) \ - ({ \ - char __buf[MAX_STRERR_LEN]; \ - libbpf_strerror((err), __buf, sizeof(__buf)); \ - FAIL("%s: %s", (msg), __buf); \ - }) - -/* Wrappers that fail the test on error and report it. */ - -#define xaccept_nonblock(fd, addr, len) \ - ({ \ - int __ret = \ - accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("accept"); \ - __ret; \ - }) - -#define xbind(fd, addr, len) \ - ({ \ - int __ret = bind((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("bind"); \ - __ret; \ - }) - -#define xclose(fd) \ - ({ \ - int __ret = close((fd)); \ - if (__ret == -1) \ - FAIL_ERRNO("close"); \ - __ret; \ - }) - -#define xconnect(fd, addr, len) \ - ({ \ - int __ret = connect((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("connect"); \ - __ret; \ - }) - -#define xgetsockname(fd, addr, len) \ - ({ \ - int __ret = getsockname((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockname"); \ - __ret; \ - }) - -#define xgetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = getsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockopt(" #name ")"); \ - __ret; \ - }) - -#define xlisten(fd, backlog) \ - ({ \ - int __ret = listen((fd), (backlog)); \ - if (__ret == -1) \ - FAIL_ERRNO("listen"); \ - __ret; \ - }) - -#define xsetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = setsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("setsockopt(" #name ")"); \ - __ret; \ - }) - -#define xsend(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = send((fd), (buf), (len), (flags)); \ - if (__ret == -1) \ - FAIL_ERRNO("send"); \ - __ret; \ - }) - -#define xrecv_nonblock(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ - IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("recv"); \ - __ret; \ - }) - -#define xsocket(family, sotype, flags) \ - ({ \ - int __ret = socket(family, sotype, flags); \ - if (__ret == -1) \ - FAIL_ERRNO("socket"); \ - __ret; \ - }) - #define xbpf_map_delete_elem(fd, key) \ ({ \ int __ret = bpf_map_delete_elem((fd), (key)); \ @@ -193,130 +66,6 @@ __ret; \ }) -static inline void close_fd(int *fd) -{ - if (*fd >= 0) - xclose(*fd); -} - -#define __close_fd __attribute__((cleanup(close_fd))) - -static inline int poll_connect(int fd, unsigned int timeout_sec) -{ - struct timeval 
timeout = { .tv_sec = timeout_sec }; - fd_set wfds; - int r, eval; - socklen_t esize = sizeof(eval); - - FD_ZERO(&wfds); - FD_SET(fd, &wfds); - - r = select(fd + 1, NULL, &wfds, NULL, &timeout); - if (r == 0) - errno = ETIME; - if (r != 1) - return -1; - - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) - return -1; - if (eval != 0) { - errno = eval; - return -1; - } - - return 0; -} - -static inline int poll_read(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set rfds; - int r; - - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - - r = select(fd + 1, &rfds, NULL, NULL, &timeout); - if (r == 0) - errno = ETIME; - - return r == 1 ? 0 : -1; -} - -static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return accept(fd, addr, len); -} - -static inline int recv_timeout(int fd, void *buf, size_t len, int flags, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return recv(fd, buf, len, flags); -} - -static inline void init_addr_loopback4(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); - - addr4->sin_family = AF_INET; - addr4->sin_port = 0; - addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - *len = sizeof(*addr4); -} - -static inline void init_addr_loopback6(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); - - addr6->sin6_family = AF_INET6; - addr6->sin6_port = 0; - addr6->sin6_addr = in6addr_loopback; - *len = sizeof(*addr6); -} - -static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); - - addr->svm_family = AF_VSOCK; - addr->svm_port = VMADDR_PORT_ANY; - addr->svm_cid = VMADDR_CID_LOCAL; - *len = sizeof(*addr); -} - -static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, - socklen_t *len) -{ - switch (family) { - case AF_INET: - init_addr_loopback4(ss, len); - return; - case AF_INET6: - init_addr_loopback6(ss, len); - return; - case AF_VSOCK: - init_addr_loopback_vsock(ss, len); - return; - default: - FAIL("unsupported address family %d", family); - } -} - -static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) -{ - return (struct sockaddr *)ss; -} - static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) { u64 value; @@ -334,136 +83,4 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int enable_reuseport(int s, int progfd) -{ - int err, one = 1; - - err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - if (err) - return -1; - err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, - sizeof(progfd)); - if (err) - return -1; - - return 0; -} - -static inline int socket_loopback_reuseport(int family, int sotype, int progfd) -{ - struct sockaddr_storage addr; - socklen_t len = 0; - int err, s; - - init_addr_loopback(family, &addr, &len); - - s = xsocket(family, sotype, 0); - if (s == -1) - return -1; - - if (progfd >= 0) - enable_reuseport(s, progfd); - - err = xbind(s, sockaddr(&addr), len); - if (err) - goto close; - - if (sotype & SOCK_DGRAM) - return s; - - err = xlisten(s, SOMAXCONN); - if (err) - goto close; - - return s; -close: - xclose(s); - return -1; -} - -static inline int socket_loopback(int family, int 
sotype) -{ - return socket_loopback_reuseport(family, sotype, -1); -} - -static inline int create_pair(int family, int sotype, int *p0, int *p1) -{ - __close_fd int s, c = -1, p = -1; - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int err; - - s = socket_loopback(family, sotype); - if (s < 0) - return s; - - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - c = xsocket(family, sotype, 0); - if (c < 0) - return c; - - err = connect(c, sockaddr(&addr), len); - if (err) { - if (errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - return err; - } - - err = poll_connect(c, IO_TIMEOUT_SEC); - if (err) { - FAIL_ERRNO("poll_connect"); - return err; - } - } - - switch (sotype & SOCK_TYPE_MASK) { - case SOCK_DGRAM: - err = xgetsockname(c, sockaddr(&addr), &len); - if (err) - return err; - - err = xconnect(s, sockaddr(&addr), len); - if (err) - return err; - - *p0 = take_fd(s); - break; - case SOCK_STREAM: - case SOCK_SEQPACKET: - p = xaccept_nonblock(s, NULL, NULL); - if (p < 0) - return p; - - *p0 = take_fd(p); - break; - default: - FAIL("Unsupported socket type %#x", sotype); - return -EOPNOTSUPP; - } - - *p1 = take_fd(c); - return 0; -} - -static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, - int *p0, int *p1) -{ - int err; - - err = create_pair(family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - - return err; -} - #endif // __SOCKMAP_HELPERS__ -- cgit v1.2.3 From 4a58963d10fa3cb654b859e3f9a8aecbcf9f4982 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:57 -0800 Subject: selftests/bpf: Test bpf_skb_change_tail() in TC ingress Similarly to the previous test, we also need a test case to cover positive offsets as well, TC is an excellent hook for this. 
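The new test lands under prog_tests/, so it is driven by the standard BPF selftest runner. As a rough sketch of how to build and run just this case (the runner name and the -t name filter come from the usual selftests/bpf workflow rather than from this patch):

  $ cd tools/testing/selftests/bpf
  $ make
  $ sudo ./test_progs -t tc_change_tail

Root privileges are needed since the program is attached to loopback via tcx and the test then sends over a UDP socket pair on that interface.
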
Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Zijian Zhang Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-5-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/tc_change_tail.c | 62 ++++++++++++ .../selftests/bpf/progs/test_tc_change_tail.c | 106 +++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_change_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_change_tail.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c new file mode 100644 index 000000000000..74752233e779 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_tc_change_tail.skel.h" +#include "socket_helpers.h" + +#define LO_IFINDEX 1 + +void test_tc_change_tail(void) +{ + LIBBPF_OPTS(bpf_tcx_opts, tcx_opts); + struct test_tc_change_tail *skel = NULL; + struct bpf_link *link; + int c1, p1; + char buf[2]; + int ret; + + skel = test_tc_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tc_change_tail__open_and_load")) + return; + + link = bpf_program__attach_tcx(skel->progs.change_tail, LO_IFINDEX, + &tcx_opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tcx")) + goto destroy; + + skel->links.change_tail = link; + ret = create_pair(AF_INET, SOCK_DGRAM, &c1, &p1); + if (!ASSERT_OK(ret, "create_pair")) + goto destroy; + + ret = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(ret, 2, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "G", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "E", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + ret = xsend(p1, "Z", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + close(c1); + close(p1); +destroy: + test_tc_change_tail__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_change_tail.c b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c new file mode 100644 index 000000000000..28edafe803f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +long change_tail_ret = 1; + +static __always_inline struct iphdr *parse_ip_header(struct __sk_buff *skb, int *ip_proto) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = data; + struct iphdr *iph; + + /* Verify Ethernet header */ + if ((void *)(data + sizeof(*eth)) > data_end) + return NULL; + + /* Skip Ethernet header to get to IP header */ + iph = (void *)(data + sizeof(struct ethhdr)); + + /* Verify IP header */ + if ((void *)(data + sizeof(struct ethhdr) + sizeof(*iph)) > data_end) + return NULL; + + /* Basic IP header validation */ + if (iph->version != 4) /* Only support IPv4 */ + return NULL; + + if (iph->ihl < 5) /* Minimum IP header length */ + 
return NULL; + + *ip_proto = iph->protocol; + return iph; +} + +static __always_inline struct udphdr *parse_udp_header(struct __sk_buff *skb, struct iphdr *iph) +{ + void *data_end = (void *)(long)skb->data_end; + void *hdr = (void *)iph; + struct udphdr *udp; + + /* Calculate UDP header position */ + udp = hdr + (iph->ihl * 4); + hdr = (void *)udp; + + /* Verify UDP header bounds */ + if ((void *)(hdr + sizeof(*udp)) > data_end) + return NULL; + + return udp; +} + +SEC("tc/ingress") +int change_tail(struct __sk_buff *skb) +{ + int len = skb->len; + struct udphdr *udp; + struct iphdr *iph; + void *data_end; + char *payload; + int ip_proto; + + bpf_skb_pull_data(skb, len); + + data_end = (void *)(long)skb->data_end; + iph = parse_ip_header(skb, &ip_proto); + if (!iph) + return TCX_PASS; + + if (ip_proto != IPPROTO_UDP) + return TCX_PASS; + + udp = parse_udp_header(skb, iph); + if (!udp) + return TCX_PASS; + + payload = (char *)udp + (sizeof(struct udphdr)); + if (payload + 1 > (char *)data_end) + return TCX_PASS; + + if (payload[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len - 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len + 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return TCX_PASS; + } else if (payload[0] == 'Z') { /* Zero */ + change_tail_ret = bpf_skb_change_tail(skb, 0, 0); + return TCX_PASS; + } + return TCX_DROP; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f63df61651be541cc5699083faa1bfbaa105ed44 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 18:01:33 +0100 Subject: selftests: add pidfd bind-mount tests Link: https://lore.kernel.org/r/20241219-work-pidfs-mount-v1-2-dbc56198b839@kernel.org Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile | 2 +- tools/testing/selftests/pidfd/pidfd_bind_mount.c | 188 +++++++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_bind_mount.c (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 224260e1a4a2..bf92481f925c 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -7,3 +7,4 @@ pidfd_fdinfo_test pidfd_getfd_test pidfd_setns_test pidfd_file_handle_test +pidfd_bind_mount diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 3c16d8e77684..301343a11b62 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -3,7 +3,7 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ - pidfd_file_handle_test + pidfd_file_handle_test pidfd_bind_mount include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c new file mode 100644 index 000000000000..7822dd080258 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner + +#define _GNU_SOURCE 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest_harness.h" + +#ifndef __NR_open_tree + #if defined __alpha__ + #define __NR_open_tree 538 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree 4428 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree 6428 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree 5428 + #endif + #elif defined __ia64__ + #define __NR_open_tree (428 + 1024) + #else + #define __NR_open_tree 428 + #endif +#endif + +#ifndef __NR_move_mount + #if defined __alpha__ + #define __NR_move_mount 539 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_move_mount 4429 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_move_mount 6429 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_move_mount 5429 + #endif + #elif defined __ia64__ + #define __NR_move_mount (428 + 1024) + #else + #define __NR_move_mount 429 + #endif +#endif + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#endif + +static inline int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, + to_pathname, flags); +} + +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + +FIXTURE(pidfd_bind_mount) { + char template[PATH_MAX]; + int fd_tmp; + int pidfd; + struct stat st1; + struct stat st2; + __u32 gen1; + __u32 gen2; + bool must_unmount; +}; + +FIXTURE_SETUP(pidfd_bind_mount) +{ + self->fd_tmp = -EBADF; + self->must_unmount = false; + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_LE(snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX"), PATH_MAX); + self->fd_tmp = mkstemp(self->template); + ASSERT_GE(self->fd_tmp, 0); + self->pidfd = sys_pidfd_open(getpid(), 0); + ASSERT_GE(self->pidfd, 0); + ASSERT_GE(fstat(self->pidfd, &self->st1), 0); + ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0); +} + +FIXTURE_TEARDOWN(pidfd_bind_mount) +{ + ASSERT_EQ(close(self->fd_tmp), 0); + if (self->must_unmount) + ASSERT_EQ(umount2(self->template, 0), 0); + ASSERT_EQ(unlink(self->template), 0); +} + +/* + * Test that a detached mount can be created for a pidfd and then + * attached to the filesystem hierarchy. + */ +TEST_F(pidfd_bind_mount, bind_mount) +{ + int fd_tree; + + fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + self->must_unmount = true; + + ASSERT_EQ(close(fd_tree), 0); +} + +/* Test that a pidfd can be reopened through procfs. 
*/ +TEST_F(pidfd_bind_mount, reopen) +{ + int pidfd; + char proc_path[PATH_MAX]; + + sprintf(proc_path, "/proc/self/fd/%d", self->pidfd); + pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_GE(fstat(self->pidfd, &self->st2), 0); + ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen2), 0); + + ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino); + ASSERT_TRUE(self->gen1 == self->gen2); + + ASSERT_EQ(close(pidfd), 0); +} + +/* + * Test that a detached mount can be created for a pidfd and then + * attached to the filesystem hierarchy and reopened. + */ +TEST_F(pidfd_bind_mount, bind_mount_reopen) +{ + int fd_tree, fd_pidfd_mnt; + + fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + self->must_unmount = true; + + fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC); + ASSERT_GE(fd_pidfd_mnt, 0); + + ASSERT_GE(fstat(fd_tree, &self->st2), 0); + ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0); + + ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino); + ASSERT_TRUE(self->gen1 == self->gen2); + + ASSERT_EQ(close(fd_tree), 0); + ASSERT_EQ(close(fd_pidfd_mnt), 0); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From f288c7a1ba268a9ed58a7971142a98a1e41a3c73 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 16:31:16 -0800 Subject: selftests: drv-net: assume stats refresh is 0 if no ethtool -c support Tests using HW stats wait for them to stabilize, using data from ethtool -c as the delay. Not all drivers implement ethtool -c so handle the errors gracefully. Reviewed-by: Andrew Lunn Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241220003116.1458863-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 9 +++++++-- tools/testing/selftests/net/lib/py/utils.py | 6 ++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 1ea9bb695e94..fea343f209ea 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -5,7 +5,7 @@ import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup -from lib.py import cmd, ethtool, ip +from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -234,7 +234,12 @@ class NetDrvEpEnv: Good drivers will tell us via ethtool what their sync period is. 
""" if self._stats_settle_time is None: - data = ethtool("-c " + self.ifname, json=True)[0] + data = {} + try: + data = ethtool("-c " + self.ifname, json=True)[0] + except CmdExitFailure as e: + if "Operation not supported" not in e.cmd.stderr: + raise self._stats_settle_time = 0.025 + \ data.get('stats-block-usecs', 0) / 1000 / 1000 diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 72590c3f90f1..9e3bcddcf3e8 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -10,7 +10,9 @@ import time class CmdExitFailure(Exception): - pass + def __init__(self, msg, cmd_obj): + super().__init__(msg) + self.cmd = cmd_obj class cmd: @@ -48,7 +50,7 @@ class cmd: if len(stderr) > 0 and stderr[-1] == "\n": stderr = stderr[:-1] raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % - (self.proc.args, stdout, stderr)) + (self.proc.args, stdout, stderr), self) class bkg(cmd): -- cgit v1.2.3 From f3af3ba1083836d174ada619366783fa17272f66 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:28 +0100 Subject: vsock/test: Use NSEC_PER_SEC Replace 1000000000ULL with NSEC_PER_SEC. No functional change intended. Reviewed-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-1-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 48f17641ca50..38fd8d96eb83 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -559,7 +560,7 @@ static time_t current_nsec(void) exit(EXIT_FAILURE); } - return (ts.tv_sec * 1000000000ULL) + ts.tv_nsec; + return (ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; } #define RCVTIMEO_TIMEOUT_SEC 1 @@ -599,7 +600,7 @@ static void test_seqpacket_timeout_client(const struct test_opts *opts) } read_overhead_ns = current_nsec() - read_enter_ns - - 1000000000ULL * RCVTIMEO_TIMEOUT_SEC; + NSEC_PER_SEC * RCVTIMEO_TIMEOUT_SEC; if (read_overhead_ns > READ_OVERHEAD_NSEC) { fprintf(stderr, -- cgit v1.2.3 From ef8bd18f475e969753b1b72588a4932195d420f3 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:29 +0100 Subject: vsock/test: Introduce option to select tests Allow for selecting specific test IDs to be executed. 
Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-2-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 29 +++++++++++++++++++++++++++-- tools/testing/vsock/util.h | 2 ++ tools/testing/vsock/vsock_test.c | 11 +++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 34e9dac0a105..81b9a31059d8 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -486,8 +486,7 @@ void list_tests(const struct test_case *test_cases) exit(EXIT_FAILURE); } -void skip_test(struct test_case *test_cases, size_t test_cases_len, - const char *test_id_str) +static unsigned long parse_test_id(const char *test_id_str, size_t test_cases_len) { unsigned long test_id; char *endptr = NULL; @@ -505,9 +504,35 @@ void skip_test(struct test_case *test_cases, size_t test_cases_len, exit(EXIT_FAILURE); } + return test_id; +} + +void skip_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str) +{ + unsigned long test_id = parse_test_id(test_id_str, test_cases_len); test_cases[test_id].skip = true; } +void pick_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str) +{ + static bool skip_all = true; + unsigned long test_id; + + if (skip_all) { + unsigned long i; + + for (i = 0; i < test_cases_len; ++i) + test_cases[i].skip = true; + + skip_all = false; + } + + test_id = parse_test_id(test_id_str, test_cases_len); + test_cases[test_id].skip = false; +} + unsigned long hash_djb2(const void *data, size_t len) { unsigned long hash = 5381; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index ba84d296d8b7..e62f46b2b92a 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -62,6 +62,8 @@ void run_tests(const struct test_case *test_cases, void list_tests(const struct test_case *test_cases); void skip_test(struct test_case *test_cases, size_t test_cases_len, const char *test_id_str); +void pick_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str); unsigned long hash_djb2(const void *data, size_t len); size_t iovec_bytes(const struct iovec *iov, size_t iovnum); unsigned long iovec_hash_djb2(const struct iovec *iov, size_t iovnum); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 38fd8d96eb83..8bb2ab41c55f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1644,6 +1644,11 @@ static const struct option longopts[] = { .has_arg = required_argument, .val = 's', }, + { + .name = "pick", + .has_arg = required_argument, + .val = 't', + }, { .name = "help", .has_arg = no_argument, @@ -1681,6 +1686,8 @@ static void usage(void) " --peer-cid CID of the other side\n" " --peer-port AF_VSOCK port used for the test [default: %d]\n" " --list List of tests that will be executed\n" + " --pick Test ID to execute selectively;\n" + " use multiple --pick options to select more tests\n" " --skip Test ID to skip;\n" " use multiple --skip options to skip more tests\n", DEFAULT_PEER_PORT @@ -1737,6 +1744,10 @@ int main(int argc, char **argv) skip_test(test_cases, ARRAY_SIZE(test_cases) - 1, optarg); break; + case 't': + pick_test(test_cases, ARRAY_SIZE(test_cases) - 1, + optarg); + break; case '?': default: usage(); -- cgit v1.2.3 From 50f9434463a0be5b972ee442ba6a9704c9afb02a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 
2024 10:49:30 +0100 Subject: vsock/test: Add README blurb about kmemleak usage Document the suggested use of kmemleak for memory leak detection. Suggested-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-3-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/README | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README index 84ee217ba8ee..680ce666ceb5 100644 --- a/tools/testing/vsock/README +++ b/tools/testing/vsock/README @@ -36,6 +36,21 @@ Invoke test binaries in both directions as follows: --control-port=1234 \ --peer-cid=3 +Some tests are designed to produce kernel memory leaks. Leaks detection, +however, is deferred to Kernel Memory Leak Detector. It is recommended to enable +kmemleak (CONFIG_DEBUG_KMEMLEAK=y) and explicitly trigger a scan after each test +suite run, e.g. + + # echo clear > /sys/kernel/debug/kmemleak + # $TEST_BINARY ... + # echo "wait for any grace periods" && sleep 2 + # echo scan > /sys/kernel/debug/kmemleak + # echo "wait for kmemleak" && sleep 5 + # echo scan > /sys/kernel/debug/kmemleak + # cat /sys/kernel/debug/kmemleak + +For more information see Documentation/dev-tools/kmemleak.rst. + vsock_perf utility ------------------- 'vsock_perf' is a simple tool to measure vsock performance. It works in -- cgit v1.2.3 From f52e7f593b49344b9497c289cbb2ada213f60a7a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:31 +0100 Subject: vsock/test: Adapt send_byte()/recv_byte() to handle MSG_ZEROCOPY For a zerocopy send(), buffer (always byte 'A') needs to be preserved (thus it can not be on the stack) or the data recv()ed check in recv_byte() might fail. While there, change the printf format to 0x%02x so the '\0' bytes can be seen. Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-4-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 81b9a31059d8..7058dc614c25 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -401,7 +401,7 @@ void recv_buf(int fd, void *buf, size_t len, int flags, ssize_t expected_ret) */ void send_byte(int fd, int expected_ret, int flags) { - const uint8_t byte = 'A'; + static const uint8_t byte = 'A'; send_buf(fd, &byte, sizeof(byte), flags, expected_ret); } @@ -420,7 +420,7 @@ void recv_byte(int fd, int expected_ret, int flags) recv_buf(fd, &byte, sizeof(byte), flags, expected_ret); if (byte != 'A') { - fprintf(stderr, "unexpected byte read %c\n", byte); + fprintf(stderr, "unexpected byte read 0x%02x\n", byte); exit(EXIT_FAILURE); } } -- cgit v1.2.3 From f66ef469a72d19764f943067307a570f83b00dca Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:32 +0100 Subject: vsock/test: Add test for accept_queue memory leak Attempt to enqueue a child after the queue was flushed, but before SOCK_DONE flag has been set. Test tries to produce a memory leak, kmemleak should be employed. Dealing with a race condition, test by its very nature may lead to a false negative. Fixed by commit d7b0ff5a8667 ("virtio/vsock: Fix accept_queue memory leak"). 
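Tying this together with the kmemleak workflow documented in the README and the --pick option introduced earlier in this series, a minimal way to exercise only this test might look as follows (the test ID is illustrative, --list reports the real one, and the connection flags mirror the README invocation):

  # echo clear > /sys/kernel/debug/kmemleak
  # ./vsock_test --control-port=1234 --peer-cid=3 --pick <id-of-leak-test>
  # echo scan > /sys/kernel/debug/kmemleak
  # cat /sys/kernel/debug/kmemleak

As in the README example, the peer side runs the matching binary, and a short sleep before the scan gives grace periods a chance to expire.
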
Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-5-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 8bb2ab41c55f..2a8fcb062d9d 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -29,6 +29,10 @@ #include "control.h" #include "util.h" +/* Basic messages for control_writeulong(), control_readulong() */ +#define CONTROL_CONTINUE 1 +#define CONTROL_DONE 0 + static void test_stream_connection_reset(const struct test_opts *opts) { union { @@ -1474,6 +1478,49 @@ static void test_stream_cred_upd_on_set_rcvlowat(const struct test_opts *opts) test_stream_credit_update_test(opts, false); } +/* The goal of test leak_acceptq is to stress the race between connect() and + * close(listener). Implementation of client/server loops boils down to: + * + * client server + * ------ ------ + * write(CONTINUE) + * expect(CONTINUE) + * listen() + * write(LISTENING) + * expect(LISTENING) + * connect() close() + */ +#define ACCEPTQ_LEAK_RACE_TIMEOUT 2 /* seconds */ + +static void test_stream_leak_acceptq_client(const struct test_opts *opts) +{ + time_t tout; + int fd; + + tout = current_nsec() + ACCEPTQ_LEAK_RACE_TIMEOUT * NSEC_PER_SEC; + do { + control_writeulong(CONTROL_CONTINUE); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd >= 0) + close(fd); + } while (current_nsec() < tout); + + control_writeulong(CONTROL_DONE); +} + +/* Test for a memory leak. User is expected to run kmemleak scan, see README. */ +static void test_stream_leak_acceptq_server(const struct test_opts *opts) +{ + int fd; + + while (control_readulong() == CONTROL_CONTINUE) { + fd = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port); + control_writeln("LISTENING"); + close(fd); + } +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1604,6 +1651,11 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_unsent_bytes_client, .run_server = test_seqpacket_unsent_bytes_server, }, + { + .name = "SOCK_STREAM leak accept queue", + .run_client = test_stream_leak_acceptq_client, + .run_server = test_stream_leak_acceptq_server, + }, {}, }; -- cgit v1.2.3 From ec50efee8cf814035d82f3b42dad916144d98b38 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:33 +0100 Subject: vsock/test: Add test for sk_error_queue memory leak Ask for MSG_ZEROCOPY completion notification, but do not recv() it. Test attempts to create a memory leak, kmemleak should be employed. Fixed by commit fbf7085b3ad1 ("vsock: Fix sk_error_queue memory leak"). Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-6-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 45 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 2a8fcb062d9d..2dec6290b075 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1521,6 +1521,46 @@ static void test_stream_leak_acceptq_server(const struct test_opts *opts) } } +/* Test for a memory leak. User is expected to run kmemleak scan, see README. 
*/ +static void test_stream_msgzcopy_leak_errq_client(const struct test_opts *opts) +{ + struct pollfd fds = { 0 }; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + send_byte(fd, 1, MSG_ZEROCOPY); + + fds.fd = fd; + fds.events = 0; + if (poll(&fds, 1, -1) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_msgzcopy_leak_errq_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_byte(fd, 1, 0); + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1656,6 +1696,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_leak_acceptq_client, .run_server = test_stream_leak_acceptq_server, }, + { + .name = "SOCK_STREAM MSG_ZEROCOPY leak MSG_ERRQUEUE", + .run_client = test_stream_msgzcopy_leak_errq_client, + .run_server = test_stream_msgzcopy_leak_errq_server, + }, {}, }; -- cgit v1.2.3 From d127ac8b1d4d3524d292b597100fef96dd909c9b Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:34 +0100 Subject: vsock/test: Add test for MSG_ZEROCOPY completion memory leak Exercise the ENOMEM error path by attempting to hit net.core.optmem_max limit on send(). Test aims to create a memory leak, kmemleak should be employed. Fixed by commit 60cf6206a1f5 ("virtio/vsock: Improve MSG_ZEROCOPY error handling"). Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-7-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 152 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 2dec6290b075..1eebbc0d5f61 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1561,6 +1561,153 @@ static void test_stream_msgzcopy_leak_errq_server(const struct test_opts *opts) close(fd); } +/* Test msgzcopy_leak_zcskb is meant to exercise sendmsg() error handling path, + * that might leak an skb. The idea is to fail virtio_transport_init_zcopy_skb() + * by hitting net.core.optmem_max limit in sock_omalloc(), specifically + * + * vsock_connectible_sendmsg + * virtio_transport_stream_enqueue + * virtio_transport_send_pkt_info + * virtio_transport_init_zcopy_skb + * . msg_zerocopy_realloc + * . msg_zerocopy_alloc + * . sock_omalloc + * . sk_omem_alloc + size > sysctl_optmem_max + * return -ENOMEM + * + * We abuse the implementation detail of net/socket.c:____sys_sendmsg(). + * sk_omem_alloc can be precisely bumped by sock_kmalloc(), as it is used to + * fetch user-provided control data. + * + * While this approach works for now, it relies on assumptions regarding the + * implementation and configuration (for example, order of net.core.optmem_max + * can not exceed MAX_PAGE_ORDER), which may not hold in the future. A more + * resilient testing could be implemented by leveraging the Fault injection + * framework (CONFIG_FAULT_INJECTION), e.g. 
+ * + * client# echo N > /sys/kernel/debug/failslab/ignore-gfp-wait + * client# echo 0 > /sys/kernel/debug/failslab/verbose + * + * void client(const struct test_opts *opts) + * { + * char buf[16]; + * int f, s, i; + * + * f = open("/proc/self/fail-nth", O_WRONLY); + * + * for (i = 1; i < 32; i++) { + * control_writeulong(CONTROL_CONTINUE); + * + * s = vsock_stream_connect(opts->peer_cid, opts->peer_port); + * enable_so_zerocopy_check(s); + * + * sprintf(buf, "%d", i); + * write(f, buf, strlen(buf)); + * + * send(s, &(char){ 0 }, 1, MSG_ZEROCOPY); + * + * write(f, "0", 1); + * close(s); + * } + * + * control_writeulong(CONTROL_DONE); + * close(f); + * } + * + * void server(const struct test_opts *opts) + * { + * int fd; + * + * while (control_readulong() == CONTROL_CONTINUE) { + * fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + * vsock_wait_remote_close(fd); + * close(fd); + * } + * } + * + * Refer to Documentation/fault-injection/fault-injection.rst. + */ +#define MAX_PAGE_ORDER 10 /* usually */ +#define PAGE_SIZE 4096 + +/* Test for a memory leak. User is expected to run kmemleak scan, see README. */ +static void test_stream_msgzcopy_leak_zcskb_client(const struct test_opts *opts) +{ + size_t optmem_max, ctl_len, chunk_size; + struct msghdr msg = { 0 }; + struct iovec iov; + char *chunk; + int fd, res; + FILE *f; + + f = fopen("/proc/sys/net/core/optmem_max", "r"); + if (!f) { + perror("fopen(optmem_max)"); + exit(EXIT_FAILURE); + } + + if (fscanf(f, "%zu", &optmem_max) != 1) { + fprintf(stderr, "fscanf(optmem_max) failed\n"); + exit(EXIT_FAILURE); + } + + fclose(f); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + + ctl_len = optmem_max - 1; + if (ctl_len > PAGE_SIZE << MAX_PAGE_ORDER) { + fprintf(stderr, "Try with net.core.optmem_max = 100000\n"); + exit(EXIT_FAILURE); + } + + chunk_size = CMSG_SPACE(ctl_len); + chunk = malloc(chunk_size); + if (!chunk) { + perror("malloc"); + exit(EXIT_FAILURE); + } + memset(chunk, 0, chunk_size); + + iov.iov_base = &(char){ 0 }; + iov.iov_len = 1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = chunk; + msg.msg_controllen = ctl_len; + + errno = 0; + res = sendmsg(fd, &msg, MSG_ZEROCOPY); + if (res >= 0 || errno != ENOMEM) { + fprintf(stderr, "Expected ENOMEM, got errno=%d res=%d\n", + errno, res); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_msgzcopy_leak_zcskb_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1701,6 +1848,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_leak_errq_client, .run_server = test_stream_msgzcopy_leak_errq_server, }, + { + .name = "SOCK_STREAM MSG_ZEROCOPY leak completion skb", + .run_client = test_stream_msgzcopy_leak_zcskb_client, + .run_server = test_stream_msgzcopy_leak_zcskb_server, + }, {}, }; -- cgit v1.2.3 From ce2b93fc1dfa1c82f2576aa571731c4e5dcc8dd7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Dec 2024 14:09:15 -1000 Subject: sched_ext: Fix dsq_local_on selftest The dsp_local_on selftest expects the scheduler to fail by trying to schedule an e.g. CPU-affine task to the wrong CPU. 
However, this isn't guaranteed to happen in the 1 second window that the test is running. Besides, it's odd to have this particular exception path tested when there are no other tests that verify that the interface is working at all - e.g. the test would pass if dsp_local_on interface is completely broken and fails on any attempt. Flip the test so that it verifies that the feature works. While at it, fix a typo in the info message. Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Link: http://lkml.kernel.org/r/Z1n9v7Z6iNJ-wKmq@slm.duckdns.org Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 5 ++++- tools/testing/selftests/sched_ext/dsp_local_on.c | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6325bf76f47e..fbda6bf54671 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,10 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - target = bpf_get_prandom_u32() % nr_cpus; + if (p->nr_cpus_allowed == nr_cpus) + target = bpf_get_prandom_u32() % nr_cpus; + else + target = scx_bpf_task_cpu(p); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 472851b56854..0ff27e57fe43 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -34,9 +34,10 @@ static enum scx_test_status run(void *ctx) /* Just sleeping is fine, plenty of scheduling events happening */ sleep(1); - SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); bpf_link__destroy(link); + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + return SCX_TEST_PASS; } @@ -50,7 +51,7 @@ static void cleanup(void *ctx) struct scx_test dsp_local_on = { .name = "dsp_local_on", .description = "Verify we can directly dispatch tasks to a local DSQs " - "from osp.dispatch()", + "from ops.dispatch()", .setup = setup, .run = run, .cleanup = cleanup, -- cgit v1.2.3 From 91fce23a08f6f8cc827b865d1870b7c39bf10455 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:14:43 +0900 Subject: selftests: ftrace: Remove obsolate maxactive syntax check Since the fprobe event does not support maxactive anymore, stop testing the maxactive syntax error checking. 
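For context, a small sketch of the fprobe dynamic event syntax the remaining check covers, run from the tracefs directory (the event and function names are illustrative):

  # echo 'f:myevent vfs_read' >> dynamic_events    # plain fprobe, accepted
  # echo 'f100 vfs_read' >> dynamic_events         # maxactive count, rejected

Only one maxactive form is still exercised, and it is now expected to fail with BAD_MAXACT rather than with the three maxactive-specific errors checked before.
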
Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519008333.391279.10184048816208739987.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc index 61877d166451..c9425a34fae3 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc @@ -16,9 +16,7 @@ aarch64) REG=%r0 ;; esac -check_error 'f^100 vfs_read' # MAXACT_NO_KPROBE -check_error 'f^1a111 vfs_read' # BAD_MAXACT -check_error 'f^100000 vfs_read' # MAXACT_TOO_BIG +check_error 'f^100 vfs_read' # BAD_MAXACT check_error 'f ^non_exist_func' # BAD_PROBE_ADDR (enoent) check_error 'f ^vfs_read+10' # BAD_PROBE_ADDR -- cgit v1.2.3 From 0c2dd44d3f9b1e6959cd201e2192cd55636d7bbb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:14:54 +0900 Subject: selftests/ftrace: Add a test case for repeating register/unregister fprobe This test case repeats define and undefine the fprobe dynamic event to ensure that the fprobe does not cause any issue with such operations. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519009398.391279.4625924605120064761.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../test.d/dynevent/add_remove_fprobe_repeat.tc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_repeat.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_repeat.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_repeat.tc new file mode 100644 index 000000000000..b4ad09237e2a --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_repeat.tc @@ -0,0 +1,19 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - Repeating add/remove fprobe events +# requires: dynamic_events "f[:[/][]] [%return] []":README + +echo 0 > events/enable +echo > dynamic_events + +PLACE=$FUNCTION_FORK +REPEAT_TIMES=64 + +for i in `seq 1 $REPEAT_TIMES`; do + echo "f:myevent $PLACE" >> dynamic_events + grep -q myevent dynamic_events + test -d events/fprobes/myevent + echo > dynamic_events +done + +clear_trace -- cgit v1.2.3 From 8d097444982d7b23a5396169dc9d2923a59b5a79 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Tue, 24 Dec 2024 01:23:28 -0500 Subject: pm: cpupower: Add header changes for cpufreq.h to SWIG bindings "cpupower: Add support for showing energy performance preference" added two new functions to cpufreq.h. This patch adds them to the bindings. Link: https://lore.kernel.org/linux-pm/8dc731c3-6586-4265-ae6a-d93ed219a963@linuxfoundation.org/T/#t Tested by compiling both libcpupower and the headers; running the test script that does not use the functions as a basic sanity test. Link: https://lore.kernel.org/r/20241224062329.39606-1-jwyatt@redhat.com Signed-off-by: "John B. 
Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/raw_pylibcpupower.swg | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg b/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg index 96556d87a745..a8226c79cfea 100644 --- a/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg +++ b/tools/power/cpupower/bindings/python/raw_pylibcpupower.swg @@ -134,6 +134,9 @@ void cpufreq_put_stats(struct cpufreq_stats *stats); unsigned long cpufreq_get_transitions(unsigned int cpu); +char *cpufreq_get_energy_performance_preference(unsigned int cpu); +void cpufreq_put_energy_performance_preference(char *ptr); + int cpufreq_set_policy(unsigned int cpu, struct cpufreq_policy *policy); int cpufreq_modify_policy_min(unsigned int cpu, unsigned long min_freq); -- cgit v1.2.3 From 6cc45f8c1f898570916044f606be9890d295e129 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 27 Nov 2024 14:41:30 +0100 Subject: rtla/timerlat: Fix histogram ALL for zero samples rtla timerlat hist currently computers the minimum, maximum and average latency even in cases when there are zero samples. This leads to nonsensical values being calculated for maximum and minimum, and to divide by zero for average. A similar bug is fixed by 01b05fc0e5f3 ("rtla/timerlat: Fix histogram report when a cpu count is 0") but the bug still remains for printing the sum over all CPUs in timerlat_print_stats_all. The issue can be reproduced with this command: $ rtla timerlat hist -U -d 1s Index over: count: min: avg: max: Floating point exception (core dumped) (There are always no samples with -U unless the user workload is created.) Fix the bug by omitting max/min/avg when sample count is zero, displaying a dash instead, just like we already do for the individual CPUs. The logic is moved into a new function called format_summary_value, which is used for both the individual CPUs and for the overall summary. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241127134130.51171-1-tglozar@redhat.com Fixes: 1462501c7a8 ("rtla/timerlat: Add a summary for hist mode") Signed-off-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/timerlat_hist.c | 177 ++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 81 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8b66387e5f35..4403cc4eba30 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -281,6 +281,21 @@ static void timerlat_hist_header(struct osnoise_tool *tool) trace_seq_reset(s); } +/* + * format_summary_value - format a line of summary value (min, max or avg) + * of hist data + */ +static void format_summary_value(struct trace_seq *seq, + int count, + unsigned long long val, + bool avg) +{ + if (count) + trace_seq_printf(seq, "%9llu ", avg ? 
val / count : val); + else + trace_seq_printf(seq, "%9c ", '-'); +} + /* * timerlat_print_summary - print the summary of the hist data to the output */ @@ -328,29 +343,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].min_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].min_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].min_user, + false); } trace_seq_printf(trace->seq, "\n"); @@ -364,29 +373,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_irq / data->hist[cpu].irq_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].sum_irq, + true); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_thread / data->hist[cpu].thread_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].sum_thread, + true); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_user / data->hist[cpu].user_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].sum_user, + true); } trace_seq_printf(trace->seq, "\n"); @@ -400,29 +403,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].max_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].max_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + 
data->hist[cpu].max_user, + false); } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); @@ -506,16 +503,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "min: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.min_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.min_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_user); + format_summary_value(trace->seq, + sum.user_count, + sum.min_user, + false); trace_seq_printf(trace->seq, "\n"); @@ -523,16 +526,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "avg: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_irq / sum.irq_count); + format_summary_value(trace->seq, + sum.irq_count, + sum.sum_irq, + true); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_thread / sum.thread_count); + format_summary_value(trace->seq, + sum.thread_count, + sum.sum_thread, + true); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_user / sum.user_count); + format_summary_value(trace->seq, + sum.user_count, + sum.sum_user, + true); trace_seq_printf(trace->seq, "\n"); @@ -540,16 +549,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "max: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.max_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.max_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_user); + format_summary_value(trace->seq, + sum.user_count, + sum.max_user, + false); trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); -- cgit v1.2.3 From 6c432b56a16a0727561211a137f37ec47f96f1d0 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:45 +0100 Subject: verification/dot2k: Fix template directory detection dot2k can be run as installed (e.g. make install) or from the kernel tree. In the former case it looks for templates in a known location; in the latter, the PWD has to be `/tools/verification` to properly import python modules. The current version looks for the template in a wrong directory in this latter case. This patch adjusts the directory where dot2k looks for templates if run from the kernel tree (i.e. not installed). Additionally we fix a few simple pylint warnings in boolean expressions. 
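For illustration, the lookup order described above boils down to a few path probes. This is a minimal standalone sketch, not the dot2k code itself (the function name is made up for the example; the three paths are the ones used in the patch below):

  import os
  import platform

  def find_templates_dir():
      # 1) running from the kernel tree with PWD == tools/verification
      candidates = ["dot2/dot2k_templates/"]
      # 2) kernel build directory installed for the running kernel
      candidates.append("/lib/modules/%s/build/tools/verification/dot2/dot2k_templates/"
                        % platform.release())
      # 3) system-wide installation (make install)
      candidates.append("/usr/share/dot2/dot2k_templates/")
      for path in candidates:
          if os.path.exists(path):
              return path
      raise FileNotFoundError("could not find the dot2k template directory")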
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-2-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 016550fccf1f..f6d02e3406a3 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -14,14 +14,14 @@ import os class dot2k(Dot2c): monitor_types = { "global" : 1, "per_cpu" : 2, "per_task" : 3 } - monitor_templates_dir = "dot2k/rv_templates/" + monitor_templates_dir = "dot2/dot2k_templates/" monitor_type = "per_cpu" def __init__(self, file_path, MonitorType): super().__init__(file_path) self.monitor_type = self.monitor_types.get(MonitorType) - if self.monitor_type == None: + if self.monitor_type is None: raise Exception("Unknown monitor type: %s" % MonitorType) self.monitor_type = MonitorType @@ -31,7 +31,7 @@ class dot2k(Dot2c): def __fill_rv_templates_dir(self): - if os.path.exists(self.monitor_templates_dir) == True: + if os.path.exists(self.monitor_templates_dir): return if platform.system() != "Linux": @@ -39,11 +39,11 @@ class dot2k(Dot2c): kernel_path = "/lib/modules/%s/build/tools/verification/dot2/dot2k_templates/" % (platform.release()) - if os.path.exists(kernel_path) == True: + if os.path.exists(kernel_path): self.monitor_templates_dir = kernel_path return - if os.path.exists("/usr/share/dot2/dot2k_templates/") == True: + if os.path.exists("/usr/share/dot2/dot2k_templates/"): self.monitor_templates_dir = "/usr/share/dot2/dot2k_templates/" return @@ -98,7 +98,7 @@ class dot2k(Dot2c): def fill_main_c(self): main_c = self.main_c min_type = self.get_minimun_type() - nr_events = self.events.__len__() + nr_events = len(self.events) tracepoint_handlers = self.fill_tracepoint_handlers_skel() tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() @@ -160,8 +160,8 @@ class dot2k(Dot2c): def __get_main_name(self): path = "%s/%s" % (self.name, "main.c") - if os.path.exists(path) == False: - return "main.c" + if not os.path.exists(path): + return "main.c" return "__main.c" def print_files(self): -- cgit v1.2.3 From ca08e071c59d96cb1db19b20ba70e9db7b9d5791 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:46 +0100 Subject: verification/dot2k: Unify main.c templates dot2k has 3 templates, one per monitor type, but the only difference among them is the `DECLARE_DA_MON_*` call, keeping 3 almost identical templates requires more work whenever we introduce a change. This patch removes the 3 dot2k templates and replaces them with a generic one, we then adjust the model type from the script. 
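As a rough sketch of what the unified template relies on, the generator only has to expand the monitor type into the DECLARE_DA_MON_*() macro name before doing the usual name substitutions (the model name "wip" and the min type here are example values, not part of this patch):

  def fill_monitor_type(monitor_type):
      # "per_cpu" -> "PER_CPU", which completes DECLARE_DA_MON_PER_CPU(...)
      return monitor_type.upper()

  line = "DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE);"
  line = line.replace("MONITOR_TYPE", fill_monitor_type("per_cpu"))
  line = line.replace("MODEL_NAME", "wip").replace("MIN_TYPE", "unsigned char")
  # -> 'DECLARE_DA_MON_PER_CPU(wip, unsigned char);'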
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-3-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 7 +- tools/verification/dot2/dot2k_templates/main.c | 91 ++++++++++++++++++++++ .../dot2/dot2k_templates/main_global.c | 91 ---------------------- .../dot2/dot2k_templates/main_per_cpu.c | 91 ---------------------- .../dot2/dot2k_templates/main_per_task.c | 91 ---------------------- 5 files changed, 97 insertions(+), 274 deletions(-) create mode 100644 tools/verification/dot2/dot2k_templates/main.c delete mode 100644 tools/verification/dot2/dot2k_templates/main_global.c delete mode 100644 tools/verification/dot2/dot2k_templates/main_per_cpu.c delete mode 100644 tools/verification/dot2/dot2k_templates/main_per_task.c (limited to 'tools') diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index f6d02e3406a3..15d6f7048f8d 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -26,7 +26,7 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() - self.main_c = self.__open_file(self.monitor_templates_dir + "main_" + MonitorType + ".c") + self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") self.enum_suffix = "_%s" % self.name def __fill_rv_templates_dir(self): @@ -69,6 +69,9 @@ class dot2k(Dot2c): # cut off the last \n return string[:-1] + def fill_monitor_type(self): + return self.monitor_type.upper() + def fill_tracepoint_handlers_skel(self): buff = [] for event in self.events: @@ -97,12 +100,14 @@ class dot2k(Dot2c): def fill_main_c(self): main_c = self.main_c + monitor_type = self.fill_monitor_type() min_type = self.get_minimun_type() nr_events = len(self.events) tracepoint_handlers = self.fill_tracepoint_handlers_skel() tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() + main_c = main_c.replace("MONITOR_TYPE", monitor_type) main_c = main_c.replace("MIN_TYPE", min_type) main_c = main_c.replace("MODEL_NAME", self.name) main_c = main_c.replace("NR_EVENTS", str(nr_events)) diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c new file mode 100644 index 000000000000..2419a6f89cd8 --- /dev/null +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "MODEL_NAME" + +/* + * XXX: include required tracepoint headers, e.g., + * #include + */ +#include + +/* + * This is the self-generated part of the monitor. Generally, there is no need + * to touch this section. + */ +#include "MODEL_NAME.h" + +/* + * Declare the deterministic automata monitor. + * + * The rv monitor reference is needed for the monitor declaration. + */ +static struct rv_monitor rv_MODEL_NAME; +DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); + +/* + * This is the instrumentation part of the monitor. + * + * This is the section where manual work is required. Here the kernel events + * are translated into model's event. 
+ * + */ +TRACEPOINT_HANDLERS_SKEL +static int enable_MODEL_NAME(void) +{ + int retval; + + retval = da_monitor_init_MODEL_NAME(); + if (retval) + return retval; + +TRACEPOINT_ATTACH + + return 0; +} + +static void disable_MODEL_NAME(void) +{ + rv_MODEL_NAME.enabled = 0; + +TRACEPOINT_DETACH + + da_monitor_destroy_MODEL_NAME(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_MODEL_NAME = { + .name = "MODEL_NAME", + .description = "auto-generated MODEL_NAME", + .enable = enable_MODEL_NAME, + .disable = disable_MODEL_NAME, + .reset = da_monitor_reset_all_MODEL_NAME, + .enabled = 0, +}; + +static int __init register_MODEL_NAME(void) +{ + rv_register_monitor(&rv_MODEL_NAME); + return 0; +} + +static void __exit unregister_MODEL_NAME(void) +{ + rv_unregister_monitor(&rv_MODEL_NAME); +} + +module_init(register_MODEL_NAME); +module_exit(unregister_MODEL_NAME); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("dot2k: auto-generated"); +MODULE_DESCRIPTION("MODEL_NAME"); diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main_global.c deleted file mode 100644 index a5658bfb9044..000000000000 --- a/tools/verification/dot2/dot2k_templates/main_global.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "MODEL_NAME" - -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ -#include - -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ -#include "MODEL_NAME.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_GLOBAL(MODEL_NAME, MIN_TYPE); - -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) -{ - int retval; - - retval = da_monitor_init_MODEL_NAME(); - if (retval) - return retval; - -TRACEPOINT_ATTACH - - return 0; -} - -static void disable_MODEL_NAME(void) -{ - rv_MODEL_NAME.enabled = 0; - -TRACEPOINT_DETACH - - da_monitor_destroy_MODEL_NAME(); -} - -/* - * This is the monitor register section. 
- */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, - .enabled = 0, -}; - -static int __init register_MODEL_NAME(void) -{ - rv_register_monitor(&rv_MODEL_NAME); - return 0; -} - -static void __exit unregister_MODEL_NAME(void) -{ - rv_unregister_monitor(&rv_MODEL_NAME); -} - -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c deleted file mode 100644 index 03539a97633f..000000000000 --- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "MODEL_NAME" - -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ -#include - -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ -#include "MODEL_NAME.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_PER_CPU(MODEL_NAME, MIN_TYPE); - -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) -{ - int retval; - - retval = da_monitor_init_MODEL_NAME(); - if (retval) - return retval; - -TRACEPOINT_ATTACH - - return 0; -} - -static void disable_MODEL_NAME(void) -{ - rv_MODEL_NAME.enabled = 0; - -TRACEPOINT_DETACH - - da_monitor_destroy_MODEL_NAME(); -} - -/* - * This is the monitor register section. - */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, - .enabled = 0, -}; - -static int __init register_MODEL_NAME(void) -{ - rv_register_monitor(&rv_MODEL_NAME); - return 0; -} - -static void __exit unregister_MODEL_NAME(void) -{ - rv_unregister_monitor(&rv_MODEL_NAME); -} - -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c deleted file mode 100644 index ffd92af87a86..000000000000 --- a/tools/verification/dot2/dot2k_templates/main_per_task.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "MODEL_NAME" - -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ -#include - -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ -#include "MODEL_NAME.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. 
- */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_PER_TASK(MODEL_NAME, MIN_TYPE); - -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) -{ - int retval; - - retval = da_monitor_init_MODEL_NAME(); - if (retval) - return retval; - -TRACEPOINT_ATTACH - - return 0; -} - -static void disable_MODEL_NAME(void) -{ - rv_MODEL_NAME.enabled = 0; - -TRACEPOINT_DETACH - - da_monitor_destroy_MODEL_NAME(); -} - -/* - * This is the monitor register section. - */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, - .enabled = 0, -}; - -static int __init register_MODEL_NAME(void) -{ - rv_register_monitor(&rv_MODEL_NAME); - return 0; -} - -static void __exit unregister_MODEL_NAME(void) -{ - rv_unregister_monitor(&rv_MODEL_NAME); -} - -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); -- cgit v1.2.3 From 91f3407e13b89b7391ebc5b6143fd22edd901041 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:47 +0100 Subject: verification/dot2k: More robust template variables The dot2k templates currently have variables that are automatically filled by the script marked as an uppercase VARIABLE. This requires some care while adding new variables to avoid using valid keywords and get them unexpectedly substituted. This patch switches the variables to the %%VARIABLE%% notation to make the pattern substitution more robust. 
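A small illustration of the failure mode the delimiters avoid, assuming a hypothetical variable named DESCRIPTION (the MODULE_DESCRIPTION() line does exist in the template; the replacement text is made up):

  # A bare uppercase variable can match unrelated text in the template;
  # substituting a variable called DESCRIPTION would also hit MODULE_DESCRIPTION:
  "MODULE_DESCRIPTION(\"MODEL_NAME\");".replace("DESCRIPTION", "my monitor")
  # -> 'MODULE_my monitor("MODEL_NAME");'

  # The delimited form only matches where intended:
  "MODULE_DESCRIPTION(\"%%DESCRIPTION%%\");".replace("%%DESCRIPTION%%", "my monitor")
  # -> 'MODULE_DESCRIPTION("my monitor");'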
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-4-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 14 ++++---- tools/verification/dot2/dot2k_templates/main.c | 50 +++++++++++++------------- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 15d6f7048f8d..c88b3c011706 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -107,13 +107,13 @@ class dot2k(Dot2c): tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() - main_c = main_c.replace("MONITOR_TYPE", monitor_type) - main_c = main_c.replace("MIN_TYPE", min_type) - main_c = main_c.replace("MODEL_NAME", self.name) - main_c = main_c.replace("NR_EVENTS", str(nr_events)) - main_c = main_c.replace("TRACEPOINT_HANDLERS_SKEL", tracepoint_handlers) - main_c = main_c.replace("TRACEPOINT_ATTACH", tracepoint_attach) - main_c = main_c.replace("TRACEPOINT_DETACH", tracepoint_detach) + main_c = main_c.replace("%%MONITOR_TYPE%%", monitor_type) + main_c = main_c.replace("%%MIN_TYPE%%", min_type) + main_c = main_c.replace("%%MODEL_NAME%%", self.name) + main_c = main_c.replace("%%NR_EVENTS%%", str(nr_events)) + main_c = main_c.replace("%%TRACEPOINT_HANDLERS_SKEL%%", tracepoint_handlers) + main_c = main_c.replace("%%TRACEPOINT_ATTACH%%", tracepoint_attach) + main_c = main_c.replace("%%TRACEPOINT_DETACH%%", tracepoint_detach) return main_c diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 2419a6f89cd8..4a05fef7f3c7 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -8,7 +8,7 @@ #include #include -#define MODULE_NAME "MODEL_NAME" +#define MODULE_NAME "%%MODEL_NAME%%" /* * XXX: include required tracepoint headers, e.g., @@ -20,15 +20,15 @@ * This is the self-generated part of the monitor. Generally, there is no need * to touch this section. */ -#include "MODEL_NAME.h" +#include "%%MODEL_NAME%%.h" /* * Declare the deterministic automata monitor. * * The rv monitor reference is needed for the monitor declaration. */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); +static struct rv_monitor rv_%%MODEL_NAME%%; +DECLARE_DA_MON_%%MONITOR_TYPE%%(%%MODEL_NAME%%, %%MIN_TYPE%%); /* * This is the instrumentation part of the monitor. @@ -37,55 +37,55 @@ DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); * are translated into model's event. * */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) +%%TRACEPOINT_HANDLERS_SKEL%% +static int enable_%%MODEL_NAME%%(void) { int retval; - retval = da_monitor_init_MODEL_NAME(); + retval = da_monitor_init_%%MODEL_NAME%%(); if (retval) return retval; -TRACEPOINT_ATTACH +%%TRACEPOINT_ATTACH%% return 0; } -static void disable_MODEL_NAME(void) +static void disable_%%MODEL_NAME%%(void) { - rv_MODEL_NAME.enabled = 0; + rv_%%MODEL_NAME%%.enabled = 0; -TRACEPOINT_DETACH +%%TRACEPOINT_DETACH%% - da_monitor_destroy_MODEL_NAME(); + da_monitor_destroy_%%MODEL_NAME%%(); } /* * This is the monitor register section. 
*/ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, +static struct rv_monitor rv_%%MODEL_NAME%% = { + .name = "%%MODEL_NAME%%", + .description = "auto-generated %%MODEL_NAME%%", + .enable = enable_%%MODEL_NAME%%, + .disable = disable_%%MODEL_NAME%%, + .reset = da_monitor_reset_all_%%MODEL_NAME%%, .enabled = 0, }; -static int __init register_MODEL_NAME(void) +static int __init register_%%MODEL_NAME%%(void) { - rv_register_monitor(&rv_MODEL_NAME); + rv_register_monitor(&rv_%%MODEL_NAME%%); return 0; } -static void __exit unregister_MODEL_NAME(void) +static void __exit unregister_%%MODEL_NAME%%(void) { - rv_unregister_monitor(&rv_MODEL_NAME); + rv_unregister_monitor(&rv_%%MODEL_NAME%%); } -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); +module_init(register_%%MODEL_NAME%%); +module_exit(unregister_%%MODEL_NAME%%); MODULE_LICENSE("GPL"); MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); +MODULE_DESCRIPTION("%%MODEL_NAME%%"); -- cgit v1.2.3 From 64b3e5f0d45329bc593e13b64dcdcf836da006cd Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:48 +0100 Subject: verification/dot2k: Add support for name and description options The dot2k command includes options to set a model name with -n and a description with -D, however those are not used in practice. This patch allows to specify a custom model name (by default the name of the dot file without extension) and a description which overrides the one in the C file. Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-5-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/automata.py | 4 ++-- tools/verification/dot2/dot2c.py | 4 ++-- tools/verification/dot2/dot2k | 6 +----- tools/verification/dot2/dot2k.py | 8 +++++--- tools/verification/dot2/dot2k_templates/main.c | 4 ++-- 5 files changed, 12 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/verification/dot2/automata.py b/tools/verification/dot2/automata.py index bdeb98baa8b0..f6921cf3c914 100644 --- a/tools/verification/dot2/automata.py +++ b/tools/verification/dot2/automata.py @@ -19,9 +19,9 @@ class Automata: invalid_state_str = "INVALID_STATE" - def __init__(self, file_path): + def __init__(self, file_path, model_name=None): self.__dot_path = file_path - self.name = self.__get_model_name() + self.name = model_name or self.__get_model_name() self.__dot_lines = self.__open_dot() self.states, self.initial_state, self.final_states = self.__get_state_variables() self.events = self.__get_event_variables() diff --git a/tools/verification/dot2/dot2c.py b/tools/verification/dot2/dot2c.py index 87d8a1e1470c..fa2816ac7b61 100644 --- a/tools/verification/dot2/dot2c.py +++ b/tools/verification/dot2/dot2c.py @@ -22,8 +22,8 @@ class Dot2c(Automata): struct_automaton_def = "automaton" var_automaton_def = "aut" - def __init__(self, file_path): - super().__init__(file_path) + def __init__(self, file_path, model_name=None): + super().__init__(file_path, model_name) self.line_length = 100 def __buff_to_string(self, buff): diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index d4d7e52d549e..827b62b8d5e1 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -25,16 +25,12 @@ if __name__ == 
'__main__': print("Opening and parsing the dot file %s" % params.dot_file) try: - monitor=dot2k(params.dot_file, params.monitor_type) + monitor=dot2k(params.dot_file, params.monitor_type, vars(params)) except Exception as e: print('Error: '+ str(e)) print("Sorry : :-(") sys.exit(1) - # easier than using argparse action. - if params.model_name != None: - print(params.model_name) - print("Writing the monitor into the directory %s" % monitor.name) monitor.print_files() print("Almost done, checklist") diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index c88b3c011706..d48ad86a035a 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -17,17 +17,18 @@ class dot2k(Dot2c): monitor_templates_dir = "dot2/dot2k_templates/" monitor_type = "per_cpu" - def __init__(self, file_path, MonitorType): - super().__init__(file_path) + def __init__(self, file_path, MonitorType, extra_params={}): + super().__init__(file_path, extra_params.get("model_name")) self.monitor_type = self.monitor_types.get(MonitorType) if self.monitor_type is None: - raise Exception("Unknown monitor type: %s" % MonitorType) + raise ValueError("Unknown monitor type: %s" % MonitorType) self.monitor_type = MonitorType self.__fill_rv_templates_dir() self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") self.enum_suffix = "_%s" % self.name + self.description = extra_params.get("description", self.name) or "auto-generated" def __fill_rv_templates_dir(self): @@ -114,6 +115,7 @@ class dot2k(Dot2c): main_c = main_c.replace("%%TRACEPOINT_HANDLERS_SKEL%%", tracepoint_handlers) main_c = main_c.replace("%%TRACEPOINT_ATTACH%%", tracepoint_attach) main_c = main_c.replace("%%TRACEPOINT_DETACH%%", tracepoint_detach) + main_c = main_c.replace("%%DESCRIPTION%%", self.description) return main_c diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 4a05fef7f3c7..704617168578 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -65,7 +65,7 @@ static void disable_%%MODEL_NAME%%(void) */ static struct rv_monitor rv_%%MODEL_NAME%% = { .name = "%%MODEL_NAME%%", - .description = "auto-generated %%MODEL_NAME%%", + .description = "%%DESCRIPTION%%", .enable = enable_%%MODEL_NAME%%, .disable = disable_%%MODEL_NAME%%, .reset = da_monitor_reset_all_%%MODEL_NAME%%, @@ -88,4 +88,4 @@ module_exit(unregister_%%MODEL_NAME%%); MODULE_LICENSE("GPL"); MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("%%MODEL_NAME%%"); +MODULE_DESCRIPTION("%%MODEL_NAME%%: %%DESCRIPTION%%"); -- cgit v1.2.3 From 9c6cfe80980056042f1f80d65c74806021708989 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:50 +0100 Subject: verification/dot2k: Simplify manual steps in monitor creation This patch reduces and simplifies the manual steps still needed in creating a new RV monitor. It extends the dot2k script to create a tracepoint snippet and a Kconfig file for the newly generated monitor. Those files can be kept in the monitor's directory but shall be included in the main tracepoint header and Kconfig. 
Together with the checklist, dot2k now suggests the lines to add to those files for inclusion and the Makefile line to compile the new monitor: Writing the monitor into the directory monitor_name Almost done, checklist - Edit the monitor_name/monitor_name.c to add the instrumentation - Edit kernel/trace/rv/rv_trace.h: Add this line where other tracepoints are included and DA_MON_EVENTS_ID is defined: #include - Edit kernel/trace/rv/Makefile: Add this line where other monitors are included: obj-$(CONFIG_RV_MON_MONITOR_NAME) += monitors/monitor_name/monitor_name.o - Edit kernel/trace/rv/Kconfig: Add this line where other monitors are included: source "kernel/trace/rv/monitors/monitor_name/Kconfig" - Move monitor_name/ to the kernel's monitor directory (kernel/trace/rv/monitors) Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-7-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k | 8 +-- tools/verification/dot2/dot2k.py | 86 +++++++++++++++++++++++++ tools/verification/dot2/dot2k_templates/Kconfig | 6 ++ tools/verification/dot2/dot2k_templates/main.c | 2 +- tools/verification/dot2/dot2k_templates/trace.h | 13 ++++ 5 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 tools/verification/dot2/dot2k_templates/Kconfig create mode 100644 tools/verification/dot2/dot2k_templates/trace.h (limited to 'tools') diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index 827b62b8d5e1..190c974edd0a 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -35,7 +35,7 @@ if __name__ == '__main__': monitor.print_files() print("Almost done, checklist") print(" - Edit the %s/%s.c to add the instrumentation" % (monitor.name, monitor.name)) - print(" - Edit include/trace/events/rv.h to add the tracepoint entry") - print(" - Move it to the kernel's monitor directory") - print(" - Edit kernel/trace/rv/Makefile") - print(" - Edit kernel/trace/rv/Kconfig") + print(monitor.fill_tracepoint_tooltip()) + print(monitor.fill_makefile_tooltip()) + print(monitor.fill_kconfig_tooltip()) + print(" - Move %s/ to the kernel's monitor directory (%s/monitors)" % (monitor.name, monitor.rv_dir)) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index d48ad86a035a..dc56cd1fb0b4 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -15,6 +15,7 @@ import os class dot2k(Dot2c): monitor_types = { "global" : 1, "per_cpu" : 2, "per_task" : 3 } monitor_templates_dir = "dot2/dot2k_templates/" + rv_dir = "kernel/trace/rv" monitor_type = "per_cpu" def __init__(self, file_path, MonitorType, extra_params={}): @@ -27,6 +28,8 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") + self.trace_h = self.__open_file(self.monitor_templates_dir + "trace.h") + self.kconfig = self.__open_file(self.monitor_templates_dir + "Kconfig") self.enum_suffix = "_%s" % self.name self.description = extra_params.get("description", self.name) or "auto-generated" @@ -144,6 +147,82 @@ class dot2k(Dot2c): return self.__buff_to_string(buff) + def fill_monitor_class_type(self): + if self.monitor_type == "per_task": + return "DA_MON_EVENTS_ID" + return "DA_MON_EVENTS_IMPLICIT" + + def fill_monitor_class(self): + if self.monitor_type == "per_task": + return "da_monitor_id" + return "da_monitor" + + def 
fill_tracepoint_args_skel(self, tp_type): + buff = [] + tp_args_event = [ + ("char *", "state"), + ("char *", "event"), + ("char *", "next_state"), + ("bool ", "final_state"), + ] + tp_args_error = [ + ("char *", "state"), + ("char *", "event"), + ] + tp_args_id = ("int ", "id") + tp_args = tp_args_event if tp_type == "event" else tp_args_error + if self.monitor_type == "per_task": + tp_args.insert(0, tp_args_id) + tp_proto_c = ", ".join([a+b for a,b in tp_args]) + tp_args_c = ", ".join([b for a,b in tp_args]) + buff.append(" TP_PROTO(%s)," % tp_proto_c) + buff.append(" TP_ARGS(%s)" % tp_args_c) + return self.__buff_to_string(buff) + + def fill_trace_h(self): + trace_h = self.trace_h + monitor_class = self.fill_monitor_class() + monitor_class_type = self.fill_monitor_class_type() + tracepoint_args_skel_event = self.fill_tracepoint_args_skel("event") + tracepoint_args_skel_error = self.fill_tracepoint_args_skel("error") + trace_h = trace_h.replace("%%MODEL_NAME%%", self.name) + trace_h = trace_h.replace("%%MODEL_NAME_UP%%", self.name.upper()) + trace_h = trace_h.replace("%%MONITOR_CLASS%%", monitor_class) + trace_h = trace_h.replace("%%MONITOR_CLASS_TYPE%%", monitor_class_type) + trace_h = trace_h.replace("%%TRACEPOINT_ARGS_SKEL_EVENT%%", tracepoint_args_skel_event) + trace_h = trace_h.replace("%%TRACEPOINT_ARGS_SKEL_ERROR%%", tracepoint_args_skel_error) + return trace_h + + def fill_kconfig(self): + kconfig = self.kconfig + monitor_class_type = self.fill_monitor_class_type() + kconfig = kconfig.replace("%%MODEL_NAME%%", self.name) + kconfig = kconfig.replace("%%MODEL_NAME_UP%%", self.name.upper()) + kconfig = kconfig.replace("%%MONITOR_CLASS_TYPE%%", monitor_class_type) + kconfig = kconfig.replace("%%DESCRIPTION%%", self.description) + return kconfig + + def fill_tracepoint_tooltip(self): + monitor_class_type = self.fill_monitor_class_type() + return """ - Edit %s/rv_trace.h: +Add this line where other tracepoints are included and %s is defined: +#include +""" % (self.rv_dir, monitor_class_type, self.name, self.name) + + def fill_kconfig_tooltip(self): + return """ - Edit %s/Kconfig: +Add this line where other monitors are included: +source \"kernel/trace/rv/monitors/%s/Kconfig\" +""" % (self.rv_dir, self.name) + + def fill_makefile_tooltip(self): + name = self.name + name_up = name.upper() + return """ - Edit %s/Makefile: +Add this line where other monitors are included: +obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o +""" % (self.rv_dir, name_up, name, name) + def __create_directory(self): try: os.mkdir(self.name) @@ -182,3 +261,10 @@ class dot2k(Dot2c): path = "%s.h" % self.name self.__create_file(path, model_h) + + trace_h = self.fill_trace_h() + path = "%s_trace.h" % self.name + self.__create_file(path, trace_h) + + kconfig = self.fill_kconfig() + self.__create_file("Kconfig", kconfig) diff --git a/tools/verification/dot2/dot2k_templates/Kconfig b/tools/verification/dot2/dot2k_templates/Kconfig new file mode 100644 index 000000000000..90cdc1e9379e --- /dev/null +++ b/tools/verification/dot2/dot2k_templates/Kconfig @@ -0,0 +1,6 @@ +config RV_MON_%%MODEL_NAME_UP%% + depends on RV + select %%MONITOR_CLASS_TYPE%% + bool "%%MODEL_NAME%% monitor" + help + %%DESCRIPTION%% diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 704617168578..9605ca994416 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -14,7 +14,7 @@ * XXX: include required tracepoint headers, e.g., * 
#include */ -#include +#include /* * This is the self-generated part of the monitor. Generally, there is no need diff --git a/tools/verification/dot2/dot2k_templates/trace.h b/tools/verification/dot2/dot2k_templates/trace.h new file mode 100644 index 000000000000..87d3a1308926 --- /dev/null +++ b/tools/verification/dot2/dot2k_templates/trace.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_%%MODEL_NAME_UP%% +DEFINE_EVENT(event_%%MONITOR_CLASS%%, event_%%MODEL_NAME%%, +%%TRACEPOINT_ARGS_SKEL_EVENT%%); + +DEFINE_EVENT(error_%%MONITOR_CLASS%%, error_%%MODEL_NAME%%, +%%TRACEPOINT_ARGS_SKEL_ERROR%%); +#endif /* CONFIG_RV_MON_%%MODEL_NAME_UP%% */ -- cgit v1.2.3 From de6f45c2dd226269fe9886290a139533c817c5bc Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:51 +0100 Subject: verification/dot2k: Auto patch current kernel source dot2k suggests a list of changes to the kernel tree while adding a monitor: edit tracepoints header, Makefile, Kconfig and moving the monitor folder. Those changes can be easily run automatically. Add a flag to dot2k to alter the kernel source. The kernel source directory can be either assumed from the PWD, or from the running kernel, if installed. This feature works best if the kernel tree is a git repository, so that its easier to make sure there are no unintended changes. The main RV files (e.g. Makefile) have now a comment placeholder that can be useful for manual editing (e.g. to know where to add new monitors) and it is used by the script to append the required lines. We also slightly adapt the file handling functions in dot2k: __open_file is now called __read_file and also closes the file before returning the content; __create_file is now a more general __write_file, we no longer return on FileExistsError (not thrown while opening), a new __create_file simply calls __write_file specifying the monitor folder in the path. 
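The in-place patching is a simple marker-based edit. A minimal sketch of the idea, simplified from the __patch_file() helper in the diff below (the "wip" monitor name is only an example):

  def patch_file(path, marker, line):
      # insert `line` just above the placeholder comment kept in the RV files
      with open(path) as f:
          content = f.read()
      with open(path, "w") as f:
          f.write(content.replace(marker, line + "\n" + marker))

  patch_file("kernel/trace/rv/Makefile",
             "# Add new monitors here",
             "obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o")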
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-8-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k | 5 ++- tools/verification/dot2/dot2k.py | 92 ++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index 190c974edd0a..559ba191a1f6 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -21,6 +21,9 @@ if __name__ == '__main__': parser.add_argument('-t', "--monitor_type", dest="monitor_type", required=True) parser.add_argument('-n', "--model_name", dest="model_name", required=False) parser.add_argument("-D", "--description", dest="description", required=False) + parser.add_argument("-a", "--auto_patch", dest="auto_patch", + action="store_true", required=False, + help="Patch the kernel in place") params = parser.parse_args() print("Opening and parsing the dot file %s" % params.dot_file) @@ -38,4 +41,4 @@ if __name__ == '__main__': print(monitor.fill_tracepoint_tooltip()) print(monitor.fill_makefile_tooltip()) print(monitor.fill_kconfig_tooltip()) - print(" - Move %s/ to the kernel's monitor directory (%s/monitors)" % (monitor.name, monitor.rv_dir)) + print(monitor.fill_monitor_tooltip()) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index dc56cd1fb0b4..83f4d49853a2 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -27,11 +27,14 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() - self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") - self.trace_h = self.__open_file(self.monitor_templates_dir + "trace.h") - self.kconfig = self.__open_file(self.monitor_templates_dir + "Kconfig") + self.main_c = self.__read_file(self.monitor_templates_dir + "main.c") + self.trace_h = self.__read_file(self.monitor_templates_dir + "trace.h") + self.kconfig = self.__read_file(self.monitor_templates_dir + "Kconfig") self.enum_suffix = "_%s" % self.name self.description = extra_params.get("description", self.name) or "auto-generated" + self.auto_patch = extra_params.get("auto_patch") + if self.auto_patch: + self.__fill_rv_kernel_dir() def __fill_rv_templates_dir(self): @@ -39,7 +42,7 @@ class dot2k(Dot2c): return if platform.system() != "Linux": - raise Exception("I can only run on Linux.") + raise OSError("I can only run on Linux.") kernel_path = "/lib/modules/%s/build/tools/verification/dot2/dot2k_templates/" % (platform.release()) @@ -51,17 +54,43 @@ class dot2k(Dot2c): self.monitor_templates_dir = "/usr/share/dot2/dot2k_templates/" return - raise Exception("Could not find the template directory, do you have the kernel source installed?") + raise FileNotFoundError("Could not find the template directory, do you have the kernel source installed?") + def __fill_rv_kernel_dir(self): - def __open_file(self, path): + # first try if we are running in the kernel tree root + if os.path.exists(self.rv_dir): + return + + # offset if we are running inside the kernel tree from verification/dot2 + kernel_path = os.path.join("../..", self.rv_dir) + + if os.path.exists(kernel_path): + self.rv_dir = kernel_path + return + + if platform.system() != "Linux": + raise OSError("I can only run on Linux.") + + kernel_path = os.path.join("/lib/modules/%s/build" % platform.release(), self.rv_dir) + + # if the current kernel is from a 
distro this may not be a full kernel tree + # verify that one of the files we are going to modify is available + if os.path.exists(os.path.join(kernel_path, "rv_trace.h")): + self.rv_dir = kernel_path + return + + raise FileNotFoundError("Could not find the rv directory, do you have the kernel source installed?") + + def __read_file(self, path): try: - fd = open(path) + fd = open(path, 'r') except OSError: raise Exception("Cannot open the file: %s" % path) content = fd.read() + fd.close() return content def __buff_to_string(self, buff): @@ -202,14 +231,32 @@ class dot2k(Dot2c): kconfig = kconfig.replace("%%DESCRIPTION%%", self.description) return kconfig + def __patch_file(self, file, marker, line): + file_to_patch = os.path.join(self.rv_dir, file) + content = self.__read_file(file_to_patch) + content = content.replace(marker, line + "\n" + marker) + self.__write_file(file_to_patch, content) + def fill_tracepoint_tooltip(self): monitor_class_type = self.fill_monitor_class_type() + if self.auto_patch: + self.__patch_file("rv_trace.h", + "// Add new monitors based on CONFIG_%s here" % monitor_class_type, + "#include " % (self.name, self.name)) + return " - Patching %s/rv_trace.h, double check the result" % self.rv_dir + return """ - Edit %s/rv_trace.h: Add this line where other tracepoints are included and %s is defined: #include """ % (self.rv_dir, monitor_class_type, self.name, self.name) def fill_kconfig_tooltip(self): + if self.auto_patch: + self.__patch_file("Kconfig", + "# Add new monitors here", + "source \"kernel/trace/rv/monitors/%s/Kconfig\"" % (self.name)) + return " - Patching %s/Kconfig, double check the result" % self.rv_dir + return """ - Edit %s/Kconfig: Add this line where other monitors are included: source \"kernel/trace/rv/monitors/%s/Kconfig\" @@ -218,32 +265,49 @@ source \"kernel/trace/rv/monitors/%s/Kconfig\" def fill_makefile_tooltip(self): name = self.name name_up = name.upper() + if self.auto_patch: + self.__patch_file("Makefile", + "# Add new monitors here", + "obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o" % (name_up, name, name)) + return " - Patching %s/Makefile, double check the result" % self.rv_dir + return """ - Edit %s/Makefile: Add this line where other monitors are included: obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o """ % (self.rv_dir, name_up, name, name) + def fill_monitor_tooltip(self): + if self.auto_patch: + return " - Monitor created in %s/monitors/%s" % (self.rv_dir, self. 
name) + return " - Move %s/ to the kernel's monitor directory (%s/monitors)" % (self.name, self.rv_dir) + def __create_directory(self): + path = self.name + if self.auto_patch: + path = os.path.join(self.rv_dir, "monitors", path) try: - os.mkdir(self.name) + os.mkdir(path) except FileExistsError: return except: print("Fail creating the output dir: %s" % self.name) - def __create_file(self, file_name, content): - path = "%s/%s" % (self.name, file_name) + def __write_file(self, file_name, content): try: - file = open(path, 'w') - except FileExistsError: - return + file = open(file_name, 'w') except: - print("Fail creating file: %s" % path) + print("Fail writing to file: %s" % file_name) file.write(content) file.close() + def __create_file(self, file_name, content): + path = "%s/%s" % (self.name, file_name) + if self.auto_patch: + path = os.path.join(self.rv_dir, "monitors", path) + self.__write_file(path, content) + def __get_main_name(self): path = "%s/%s" % (self.name, "main.c") if not os.path.exists(path): -- cgit v1.2.3 From 87c5d7f5e5938f713bde4e7435e6b207372a7f8e Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:52 +0100 Subject: verification/dot2k: Implement event type detection Currently dot2k treats all events equally and registers them with a general da_handle_event. This is however just part of the work because some events are necessary to understand when the monitor is entering the initial state. Specifically, the da_handle_start_event takes care of setting the monitor in the initial state and da_handle_start_run_event also registers the current event in the newly enabled monitor. da_handle_start_event can be used on events that only lead to the initial state (as it is currently done in the example monitors), while da_handle_start_run_event could be used on events that are only valid from the initial one. Failing to set at least one of those functions to handle events makes the monitor useless, since it will never be activated. This patch adapts dot2k to parse the events that surely lead to the initial state and set da_handle_start_event for those, if no such event is found but some events are only valid in the initial event, we instead set da_handle_start_run_event (it isn't necessary to set both). We still add a comment to warn the user to make sure this change is matching the model definition. 
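Conceptually, the classification is two checks per event over the transition matrix. A simplified sketch, assuming the first matrix row corresponds to the initial state (this is not the exact dot2k code):

  def pick_handler(matrix, initial_state, invalid, col):
      used = [row[col] for row in matrix if row[col] != invalid]
      # every state where the event is valid transitions to the initial state
      if used and all(nxt == initial_state for nxt in used):
          return "da_handle_start_event"
      # the event is only valid in the initial state
      if len(used) == 1 and matrix[0][col] != invalid:
          return "da_handle_start_run_event"
      return "da_handle_event"

In the generator the preference is global: da_handle_start_run_event is only suggested when no event qualifies for da_handle_start_event.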
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-9-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/automata.py | 32 ++++++++++++++++++++++++++++++++ tools/verification/dot2/dot2k.py | 11 +++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/verification/dot2/automata.py b/tools/verification/dot2/automata.py index f6921cf3c914..d9a3fe2b74bf 100644 --- a/tools/verification/dot2/automata.py +++ b/tools/verification/dot2/automata.py @@ -26,6 +26,7 @@ class Automata: self.states, self.initial_state, self.final_states = self.__get_state_variables() self.events = self.__get_event_variables() self.function = self.__create_matrix() + self.events_start, self.events_start_run = self.__store_init_events() def __get_model_name(self): basename = ntpath.basename(self.__dot_path) @@ -172,3 +173,34 @@ class Automata: cursor += 1 return matrix + + def __store_init_events(self): + events_start = [False] * len(self.events) + events_start_run = [False] * len(self.events) + for i, _ in enumerate(self.events): + curr_event_will_init = 0 + curr_event_from_init = False + curr_event_used = 0 + for j, _ in enumerate(self.states): + if self.function[j][i] != self.invalid_state_str: + curr_event_used += 1 + if self.function[j][i] == self.initial_state: + curr_event_will_init += 1 + if self.function[0][i] != self.invalid_state_str: + curr_event_from_init = True + # this event always leads to init + if curr_event_will_init and curr_event_used == curr_event_will_init: + events_start[i] = True + # this event is only called from init + if curr_event_from_init and curr_event_used == 1: + events_start_run[i] = True + return events_start, events_start_run + + def is_start_event(self, event): + return self.events_start[self.events.index(event)] + + def is_start_run_event(self, event): + # prefer handle_start_event if there + if any(self.events_start): + return False + return self.events_start_run[self.events.index(event)] diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 83f4d49853a2..7547eb290b7d 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -110,11 +110,18 @@ class dot2k(Dot2c): for event in self.events: buff.append("static void handle_%s(void *data, /* XXX: fill header */)" % event) buff.append("{") + handle = "handle_event" + if self.is_start_event(event): + buff.append("\t/* XXX: validate that this event always leads to the initial state */") + handle = "handle_start_event" + elif self.is_start_run_event(event): + buff.append("\t/* XXX: validate that this event is only valid in the initial state */") + handle = "handle_start_run_event" if self.monitor_type == "per_task": buff.append("\tstruct task_struct *p = /* XXX: how do I get p? 
*/;"); - buff.append("\tda_handle_event_%s(p, %s%s);" % (self.name, event, self.enum_suffix)); + buff.append("\tda_%s_%s(p, %s%s);" % (handle, self.name, event, self.enum_suffix)); else: - buff.append("\tda_handle_event_%s(%s%s);" % (self.name, event, self.enum_suffix)); + buff.append("\tda_%s_%s(%s%s);" % (handle, self.name, event, self.enum_suffix)); buff.append("}") buff.append("") return self.__buff_to_string(buff) -- cgit v1.2.3 From 31ad36a271290648e7c2288a03d7b933d20254d6 Mon Sep 17 00:00:00 2001 From: chenchangcheng Date: Fri, 20 Dec 2024 15:48:47 +0800 Subject: objtool: Add bch2_trans_unlocked_error() to bcachefs noreturns Fix the following objtool warning during build time: fs/bcachefs/btree_trans_commit.o: warning: objtool: bch2_trans_commit_write_locked.isra.0() falls through to next function do_bch2_trans_commit.isra.0() fs/bcachefs/btree_trans_commit.o: warning: objtool: .text: unexpected end of section ...... fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function flush_new_cached_update() fs/bcachefs/btree_update.o: warning: objtool: flush_new_cached_update() falls through to next function bch2_trans_update_by_path() bch2_trans_unlocked_error() is an Obviously Correct (tm) panic() wrapper, add it to the list of known noreturns. [ mingo: Improved the changelog ] Fixes: fd104e2967b7 ("bcachefs: bch2_trans_verify_not_unlocked()") Signed-off-by: chenchangcheng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241220074847.3418134-1-ccc194101@163.com --- tools/objtool/noreturns.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index f37614cc2c1b..b2174894f9f7 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -19,6 +19,7 @@ NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) +NORETURN(bch2_trans_unlocked_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) -- cgit v1.2.3 From cc57f6cbef65c796a5661decaeffe3f5de397d19 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 17 Oct 2024 09:45:41 +0200 Subject: KVM: riscv: selftests: Add SBI SUSP to get-reg-list test KVM supports SBI SUSP, so add it to the get-reg-list test. 
Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20241017074538.18867-6-ajones@ventanamicro.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 4bc1051848e5..ea4f31660bc7 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -112,6 +112,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_HSM: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_PMU: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_DBCN: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SUSP: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR: @@ -535,10 +536,11 @@ static const char *sbi_ext_single_id_to_str(__u64 reg_off) KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SRST), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_HSM), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_PMU), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_DBCN), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SUSP), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_STA), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_EXPERIMENTAL), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_VENDOR), - KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_DBCN), }; if (reg_off >= ARRAY_SIZE(kvm_sbi_ext_reg_name)) @@ -949,6 +951,7 @@ KVM_SBI_EXT_SUBLIST_CONFIG(base, BASE); KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA); KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU); KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN); +KVM_SBI_EXT_SIMPLE_CONFIG(susp, SUSP); KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA); KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); @@ -1017,6 +1020,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_sbi_sta, &config_sbi_pmu, &config_sbi_dbcn, + &config_sbi_susp, &config_aia, &config_fp_f, &config_fp_d, -- cgit v1.2.3 From 144dfe4017bfe13cc2d459c2c4a7a4dc832c100c Mon Sep 17 00:00:00 2001 From: Quan Zhou Date: Mon, 2 Dec 2024 11:22:12 +0800 Subject: KVM: riscv: selftests: Add Svvptc/Zabha/Ziccrse exts to get-reg-list test The KVM RISC-V allows Svvptc/Zabha/Ziccrse extensions for Guest/VM so add them to get-reg-list test. 
Signed-off-by: Quan Zhou Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/35163f0443993a942e0a021c6006bc5d2f0f5d5f.1732854096.git.zhouquan@iscas.ac.cn Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index ea4f31660bc7..8515921dfdbf 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -52,6 +52,8 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVPBMT: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVVPTC: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZABHA: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZAWRS: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA: @@ -71,6 +73,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFHMIN: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOM: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOZ: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICCRSE: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICNTR: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICOND: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICSR: @@ -430,6 +433,8 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), KVM_ISA_EXT_ARR(SVPBMT), + KVM_ISA_EXT_ARR(SVVPTC), + KVM_ISA_EXT_ARR(ZABHA), KVM_ISA_EXT_ARR(ZACAS), KVM_ISA_EXT_ARR(ZAWRS), KVM_ISA_EXT_ARR(ZBA), @@ -449,6 +454,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(ZFHMIN), KVM_ISA_EXT_ARR(ZICBOM), KVM_ISA_EXT_ARR(ZICBOZ), + KVM_ISA_EXT_ARR(ZICCRSE), KVM_ISA_EXT_ARR(ZICNTR), KVM_ISA_EXT_ARR(ZICOND), KVM_ISA_EXT_ARR(ZICSR), @@ -967,6 +973,8 @@ KVM_ISA_EXT_SIMPLE_CONFIG(svadu, SVADU); KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); KVM_ISA_EXT_SIMPLE_CONFIG(svpbmt, SVPBMT); +KVM_ISA_EXT_SIMPLE_CONFIG(svvptc, SVVPTC); +KVM_ISA_EXT_SIMPLE_CONFIG(zabha, ZABHA); KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS); KVM_ISA_EXT_SIMPLE_CONFIG(zawrs, ZAWRS); KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA); @@ -986,6 +994,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH); KVM_ISA_EXT_SIMPLE_CONFIG(zfhmin, ZFHMIN); KVM_ISA_EXT_SUBLIST_CONFIG(zicbom, ZICBOM); KVM_ISA_EXT_SUBLIST_CONFIG(zicboz, ZICBOZ); +KVM_ISA_EXT_SIMPLE_CONFIG(ziccrse, ZICCRSE); KVM_ISA_EXT_SIMPLE_CONFIG(zicntr, ZICNTR); KVM_ISA_EXT_SIMPLE_CONFIG(zicond, ZICOND); KVM_ISA_EXT_SIMPLE_CONFIG(zicsr, ZICSR); @@ -1035,6 +1044,8 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_svinval, &config_svnapot, &config_svpbmt, + &config_svvptc, + &config_zabha, &config_zacas, &config_zawrs, &config_zba, @@ -1054,6 +1065,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_zfhmin, &config_zicbom, &config_zicboz, + &config_ziccrse, &config_zicntr, &config_zicond, 
&config_zicsr, -- cgit v1.2.3 From 1846dd8e3a3e28f58e72cadbf4d81f374e63a085 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 30 Dec 2024 14:31:22 -0700 Subject: libbpf: Set MFD_NOEXEC_SEAL when creating memfd Starting from 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") and until 1717449b4417 ("memfd: drop warning for missing exec-related flags"), the kernel would print a warning if neither MFD_NOEXEC_SEAL nor MFD_EXEC is set in memfd_create(). If libbpf runs on on a kernel between these two commits (eg. on an improperly backported system), it'll trigger this warning. To avoid this warning (and also be more secure), explicitly set MFD_NOEXEC_SEAL. But since libbpf can be run on potentially very old kernels, leave a fallback for kernels without MFD_NOEXEC_SEAL support. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/6e62c2421ad7eb1da49cbf16da95aaaa7f94d394.1735594195.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 66173ddb5a2d..46492cc0927d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1731,12 +1731,24 @@ static int sys_memfd_create(const char *name, unsigned flags) #ifndef MFD_CLOEXEC #define MFD_CLOEXEC 0x0001U #endif +#ifndef MFD_NOEXEC_SEAL +#define MFD_NOEXEC_SEAL 0x0008U +#endif static int create_placeholder_fd(void) { + unsigned int flags = MFD_CLOEXEC | MFD_NOEXEC_SEAL; + const char *name = "libbpf-placeholder-fd"; int fd; - fd = ensure_good_fd(sys_memfd_create("libbpf-placeholder-fd", MFD_CLOEXEC)); + fd = ensure_good_fd(sys_memfd_create(name, flags)); + if (fd >= 0) + return fd; + else if (errno != EINVAL) + return -errno; + + /* Possibly running on kernel without MFD_NOEXEC_SEAL */ + fd = ensure_good_fd(sys_memfd_create(name, flags & ~MFD_NOEXEC_SEAL)); if (fd < 0) return -errno; return fd; -- cgit v1.2.3 From 75137d9ebe9e75358e859fda37fa1ca9f05c1a59 Mon Sep 17 00:00:00 2001 From: Matan Shachnai Date: Tue, 17 Dec 2024 22:23:35 -0500 Subject: selftests/bpf: Add testcases for BPF_MUL The previous commit improves precision of BPF_MUL. Add tests to exercise updated BPF_MUL. Signed-off-by: Matan Shachnai Link: https://lore.kernel.org/r/20241218032337.12214-3-m.shachnai@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_bounds.c | 134 +++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index a0bb7fb40ea5..0eb33bb801b5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1200,4 +1200,138 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("tc") +__description("multiply mixed sign bounds. test 1") +__success __log_level(2) +__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__naked void mult_mixed0_sign(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= 0xf;" + "r6 -= 1000000000;" + "r7 &= 0xf;" + "r7 -= 2000000000;" + "r6 *= r7;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} + +SEC("tc") +__description("multiply mixed sign bounds. 
test 2") +__success __log_level(2) +__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__naked void mult_mixed1_sign(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= 0xf;" + "r6 -= 0xa;" + "r7 &= 0xf;" + "r7 -= 0x14;" + "r6 *= r7;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} + +SEC("tc") +__description("multiply negative bounds") +__success __log_level(2) +__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__naked void mult_sign_bounds(void) +{ + asm volatile ( + "r8 = 0x7fff;" + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= 0xa;" + "r6 -= r8;" + "r7 &= 0xf;" + "r7 -= r8;" + "r6 *= r7;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} + +SEC("tc") +__description("multiply bounds that don't cross signed boundary") +__success __log_level(2) +__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__naked void mult_no_sign_crossing(void) +{ + asm volatile ( + "r6 = 0xb;" + "r8 = 0xb3c3f8c99262687 ll;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= r7;" + "r8 *= r6;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} + +SEC("tc") +__description("multiplication overflow, result in unbounded reg. test 1") +__success __log_level(2) +__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__naked void mult_unsign_ovf(void) +{ + asm volatile ( + "r8 = 0x7ffffffffff ll;" + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= 0x7fffffff;" + "r7 &= r8;" + "r6 *= r7;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} + +SEC("tc") +__description("multiplication overflow, result in unbounded reg. test 2") +__success __log_level(2) +__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__naked void mult_sign_ovf(void) +{ + asm volatile ( + "r8 = 0x7ffffffff ll;" + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r6 &= 0xa;" + "r6 -= r8;" + "r7 &= 0x7fffffff;" + "r6 *= r7;" + "exit" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_skb_store_bytes) + : __clobber_all); +} char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 9468f39ba478d001f2603ce5bf0e1ab4b97452b8 Mon Sep 17 00:00:00 2001 From: Mahe Tardy Date: Fri, 20 Dec 2024 15:22:18 +0000 Subject: selftests/bpf: fix veristat comp mode with new stats Commit 82c1f13de315 ("selftests/bpf: Add more stats into veristat") introduced new stats, added by default in the CSV output, that were not added to parse_stat_value, used in parse_stats_csv which is used in comparison mode. Thus it broke comparison mode altogether making it fail with "Unrecognized stat #7" and EINVAL. One quirk is that PROG_TYPE and ATTACH_TYPE have been transformed to strings using libbpf_bpf_prog_type_str and libbpf_bpf_attach_type_str respectively. Since we might not want to compare those string values, we just skip the parsing in this patch. We might want to translate it back to the enum value or compare the string value directly. 
Fixes: 82c1f13de315 ("selftests/bpf: Add more stats into veristat") Signed-off-by: Mahe Tardy Tested-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20241220152218.28405-1-mahe.tardy@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 9d17b4dfc170..476bf95cf684 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -1672,7 +1672,10 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats case TOTAL_STATES: case PEAK_STATES: case MAX_STATES_PER_INSN: - case MARK_READ_MAX_LEN: { + case MARK_READ_MAX_LEN: + case SIZE: + case JITED_SIZE: + case STACK: { long val; int err, n; @@ -1685,6 +1688,9 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats st->stats[id] = val; break; } + case PROG_TYPE: + case ATTACH_TYPE: + break; default: fprintf(stderr, "Unrecognized stat #%d\n", id); return -EINVAL; -- cgit v1.2.3 From ea0916e01d0b0f2cce1369ac1494239a79827270 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:18 +0000 Subject: selftests/memfd: add test for mapping write-sealed memfd read-only Now we have reinstated the ability to map F_SEAL_WRITE mappings read-only, assert that we are able to do this in a test to ensure that we do not regress this again. Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Jann Horn Cc: Julian Orth Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 43 ++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 0a0b55516028..c0c53451a16d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd) return p; } +static void *mfd_assert_mmap_read_shared(int fd) +{ + void *p; + + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + static void *mfd_assert_mmap_private(int fd) { void *p; @@ -980,6 +998,30 @@ static void test_seal_future_write(void) close(fd); } +static void test_seal_write_map_read_shared(void) +{ + int fd; + void *p; + + printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_write_map_read", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + p = mfd_assert_mmap_read_shared(fd); + + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -1593,6 +1635,7 @@ int main(int argc, char **argv) test_seal_write(); test_seal_future_write(); + test_seal_write_map_read_shared(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); -- cgit v1.2.3 From 2f84d072bdcb7d6ec66cc4d0de9f37a3dc394cd2 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Tue, 3 Dec 2024 16:21:12 +0000 Subject: cxl/pci: Add CXL Type 1/2 support to 
cxl_dvsec_rr_decode() In cxl_dvsec_rr_decode() the pci driver expects to retrieve a cxlds, struct cxl_dev_state, from the driver_data field of struct device. While that works for Type 3, drivers for Type 1/2 devices may not put a cxlds in the driver_data field. In preparation for supporting Type 1/2 devices, replace parameter 'struct device' with 'struct cxl_dev_state' in cxl_dvsec_rr_decode(). Remove the unused parameter 'cxl_port' in cxl_dvsec_rr_decode(). Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20241203162112.5088-1-alucerop@amd.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 450c7566c33f..af2594e4f35d 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -228,16 +228,16 @@ int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_hdm_decode_init, "CXL"); -int __wrap_cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, +int __wrap_cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, struct cxl_endpoint_dvsec_info *info) { int rc = 0, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - if (ops && ops->is_mock_dev(dev)) + if (ops && ops->is_mock_dev(cxlds->dev)) rc = 0; else - rc = cxl_dvsec_rr_decode(dev, port, info); + rc = cxl_dvsec_rr_decode(cxlds, info); put_cxl_mock_ops(index); return rc; -- cgit v1.2.3 From f1e8bf56320a7fb32095b6c51b707459361b403b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 24 Dec 2024 21:05:03 +0800 Subject: driver core: Constify API device_find_child() and adapt for various usages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify the following API: struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); To : struct device *device_find_child(struct device *dev, const void *data, device_match_t match); typedef int (*device_match_t)(struct device *dev, const void *data); with the following reasons: - Protect caller's match data @*data which is for comparison and lookup and the API does not actually need to modify @*data. - Make the API's parameters (@match)() and @data have the same type as all of other device finding APIs (bus|class|driver)_find_device(). - All kinds of existing device match functions can be directly taken as the API's argument, they were exported by driver core. Constify the API and adapt for various existing usages. BTW, various subsystem changes are squashed into this commit to meet 'git bisect' requirement, and this commit has the minimal and simplest changes to complement squashing shortcoming, and that may bring extra code improvement. 
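A usage sketch with the constified API (assuming a parent device pointer and a lookup by name via the stock device_match_name() helper):

  struct device *child;

  child = device_find_child(parent, "foo", device_match_name);
  if (child) {
  	/* ... use the child ... */
  	put_device(child);
  }

Custom callbacks work the same way; they simply take a const void * now, as in the cxl test change below.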
Reviewed-by: Alison Schofield Reviewed-by: Takashi Sakamoto Acked-by: Uwe Kleine-König # for drivers/pwm Signed-off-by: Zijun Hu Reviewed-by: Jonathan Cameron Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20241224-const_dfc_done-v5-4-6623037414d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- tools/testing/cxl/test/cxl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index d0337c11f9ee..cc8948f49117 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -725,7 +725,7 @@ static void default_mock_decoder(struct cxl_decoder *cxld) cxld->reset = mock_decoder_reset; } -static int first_decoder(struct device *dev, void *data) +static int first_decoder(struct device *dev, const void *data) { struct cxl_decoder *cxld; -- cgit v1.2.3 From 991c8aacfb6e3088027b8c776ad31d2c093905b8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 20 Nov 2024 08:26:15 -0800 Subject: tools/power/x86/intel-speed-select: Fix TRL restore after SST-TF disable When SST-TF is disabled, the TRL (Turbo Ratio Limit) of config level 0 is getting restored. But the TRL of current level should be restored which may not be config level 0. This is caused by a bug in treating config level as TRL level. So arguments needs to be swapped. Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/isst-core-tpmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-core-tpmi.c b/tools/power/x86/intel-speed-select/isst-core-tpmi.c index 32ea70c7dbd8..da53aaa27fc9 100644 --- a/tools/power/x86/intel-speed-select/isst-core-tpmi.c +++ b/tools/power/x86/intel-speed-select/isst-core-tpmi.c @@ -329,7 +329,7 @@ static int tpmi_get_get_trls(struct isst_id *id, int config_index, return 0; } -static int tpmi_get_get_trl(struct isst_id *id, int level, int config_index, +static int tpmi_get_get_trl(struct isst_id *id, int config_index, int level, int *trl) { struct isst_pkg_ctdp_level_info ctdp_level; -- cgit v1.2.3 From 600c8f24319cebe671a70722df99b8006daebe21 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 20 Nov 2024 08:35:40 -0800 Subject: tools/power/x86/intel-speed-select: v1.21 release This version has one fix: - Fix restoring TRL after SST-TF disable Signed-off-by: Srinivas Pandruvada --- tools/power/x86/intel-speed-select/isst-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 5127be34869e..fadfb02b8611 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,7 +16,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.20"; +static const char *version_str = "v1.21"; static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; -- cgit v1.2.3 From 15858da53542360931a457f32bcdc4287d13731f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Jan 2025 09:22:57 +0100 Subject: selftests: coredump: Add stackdump test Add a test which checks that the kstkesp field in /proc/pid/stat can be read for all threads of a coredumping process. For full details including the motivation for this test and how it works, see the README file added by this commit. 
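For reference, what the helper script does for each thread amounts to reading the 29th field of /proc/<tid>/stat; a rough C equivalent (naive whitespace split, which is also what the awk one-liner below relies on):

  unsigned long long read_kstkesp(int tid)
  {
  	char path[64], buf[4096], *tok;
  	unsigned long long esp = 0;
  	FILE *f;
  	int i;

  	snprintf(path, sizeof(path), "/proc/%d/stat", tid);
  	f = fopen(path, "r");
  	if (!f)
  		return 0;
  	if (fgets(buf, sizeof(buf), f)) {
  		tok = strtok(buf, " ");
  		for (i = 1; tok && i < 29; i++)	/* field 29 is kstkesp, see proc(5) */
  			tok = strtok(NULL, " ");
  		if (tok)
  			esp = strtoull(tok, NULL, 10);
  	}
  	fclose(f);
  	return esp;
  }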
Reviewed-by: John Ogness Signed-off-by: Nam Cao Link: https://lore.kernel.org/r/50e737b6576208566d14efcf1934fe840de6b1f4.1735805772.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/Makefile | 7 + tools/testing/selftests/coredump/README.rst | 50 +++++++ tools/testing/selftests/coredump/stackdump | 14 ++ tools/testing/selftests/coredump/stackdump_test.c | 151 ++++++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 tools/testing/selftests/coredump/Makefile create mode 100644 tools/testing/selftests/coredump/README.rst create mode 100755 tools/testing/selftests/coredump/stackdump create mode 100644 tools/testing/selftests/coredump/stackdump_test.c (limited to 'tools') diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile new file mode 100644 index 000000000000..ed210037b29d --- /dev/null +++ b/tools/testing/selftests/coredump/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS = $(KHDR_INCLUDES) + +TEST_GEN_PROGS := stackdump_test +TEST_FILES := stackdump + +include ../lib.mk diff --git a/tools/testing/selftests/coredump/README.rst b/tools/testing/selftests/coredump/README.rst new file mode 100644 index 000000000000..164a7aa181c8 --- /dev/null +++ b/tools/testing/selftests/coredump/README.rst @@ -0,0 +1,50 @@ +coredump selftest +================= + +Background context +------------------ + +`coredump` is a feature which dumps a process's memory space when the process terminates +unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. By default, +`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a +different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a +user-space program by writing the pipe symbol (`|`) followed by the command to be executed to +`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`. + +The piped user program may be interested in reading the stack pointers of the crashed process. The +crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in +`/proc/$PID/stat`. See `man 5 proc` for all the details. + +The problem +----------- +While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field +reads zero. But when the thread is dead (e.g. during a coredump), this field should have valid +value. + +However, this was broken in the past and `kstkesp` was zero even during coredump: + +* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to + always be zero + +* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the + coredumping thread. However, other threads in a coredumping process still had the problem. + +* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed + for all threads in a coredumping process. + +* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again + for the other threads in a coredumping process. + +The problem has been fixed now, but considering the history, it may appear again in the future. + +The goal of this test +--------------------- +This test detects problem with reading `kstkesp` during coredump by doing the following: + +#. Tell the kernel to execute the "stackdump" script when a coredump happens. 
This script + reads the stack pointers of all threads of crashed processes. + +#. Spawn a child process who creates some threads and then crashes. + +#. Read the output from the "stackdump" script, and make sure all stack pointer values are + non-zero. diff --git a/tools/testing/selftests/coredump/stackdump b/tools/testing/selftests/coredump/stackdump new file mode 100755 index 000000000000..96714ce42d12 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump @@ -0,0 +1,14 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +CRASH_PROGRAM_ID=$1 +STACKDUMP_FILE=$2 + +TMP=$(mktemp) + +for t in /proc/$CRASH_PROGRAM_ID/task/*; do + tid=$(basename $t) + cat /proc/$tid/stat | awk '{print $29}' >> $TMP +done + +mv $TMP $STACKDUMP_FILE diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c new file mode 100644 index 000000000000..137b2364a082 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define STACKDUMP_FILE "stack_values" +#define STACKDUMP_SCRIPT "stackdump" +#define NUM_THREAD_SPAWN 128 + +static void *do_nothing(void *) +{ + while (1) + pause(); +} + +static void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +FIXTURE(coredump) +{ + char original_core_pattern[256]; +}; + +FIXTURE_SETUP(coredump) +{ + char buf[PATH_MAX]; + FILE *file; + char *dir; + int ret; + + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret; + + unlink(STACKDUMP_FILE); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason); +} + +TEST_F(coredump, stackdump) +{ + struct sigaction action = {}; + unsigned long long stack; + char *test_dir, *line; + size_t line_length; + char buf[PATH_MAX]; + int ret, i; + FILE *file; + pid_t pid; + + /* + * Step 1: Setup core_pattern so that the stackdump script is executed when the child + * process crashes + */ + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + ASSERT_NE(-1, ret); + ASSERT_LT(ret, sizeof(buf)); + buf[ret] = '\0'; + + test_dir = dirname(buf); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + ASSERT_NE(NULL, file); + + ret = fprintf(file, "|%1$s/%2$s %%P %1$s/%3$s", test_dir, STACKDUMP_SCRIPT, STACKDUMP_FILE); + ASSERT_LT(0, ret); + + ret = fclose(file); + ASSERT_EQ(0, ret); + + /* Step 2: Create a process who spawns some threads then crashes */ + pid = fork(); + ASSERT_TRUE(pid >= 0); + if (pid == 0) + crashing_child(); + + /* + * Step 3: Wait for the stackdump 
script to write the stack pointers to the stackdump file + */ + for (i = 0; i < 10; ++i) { + file = fopen(STACKDUMP_FILE, "r"); + if (file) + break; + sleep(1); + } + ASSERT_NE(file, NULL); + + /* Step 4: Make sure all stack pointer values are non-zero */ + for (i = 0; -1 != getline(&line, &line_length, file); ++i) { + stack = strtoull(line, NULL, 10); + ASSERT_NE(stack, 0); + } + + ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN); + + fclose(file); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From e95274dfe86490ec2a5633035c24b2de6722841f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:24:58 -0800 Subject: selftests: tc-testing: reduce rshift value After previous change rshift >= 32 is no longer allowed. Modify the test to use 31, the test doesn't seem to send any traffic so the exact value shouldn't matter. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103182458.1213486-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tc-tests/filters/flow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json index 996448afe31b..91d120548bf5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json @@ -78,10 +78,10 @@ "setup": [ "$TC qdisc add dev $DEV1 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff", + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f", "expExitCode": "0", "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow", - "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass", + "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass", "matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" -- cgit v1.2.3 From b9ed315d3c4c0c294a4348edb6874d489bac47fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:56 +0100 Subject: netkit: Allow for configuring needed_{head,tail}room Allow the user to configure needed_{head,tail}room for both netkit devices. The idea is similar to 163e529200af ("veth: implement ndo_set_rx_headroom") with the difference that the two parameters can be specified upon device creation. By default the current behavior stays as is which is needed_{head,tail}room is 0. In case of Cilium, for example, the netkit devices are not enslaved into a bridge or openvswitch device (rather, BPF-based redirection is used out of tcx), and as such these parameters are not propagated into the Pod's netns via peer device. Given Cilium can run in vxlan/geneve tunneling mode (needed_headroom) and/or be used in combination with WireGuard (needed_{head,tail}room), allow the Cilium CNI plugin to specify these two upon netkit device creation. 
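A minimal sketch of the creating side, reusing the iproute2-style helpers and request layout of the selftest below (the 32/8 byte values are illustrative):

  /* inside the IFLA_INFO_DATA nest of the RTM_NEWLINK request */
  addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, NETKIT_L2);
  addattr16(&req.n, sizeof(req), IFLA_NETKIT_HEADROOM, 32);
  addattr16(&req.n, sizeof(req), IFLA_NETKIT_TAILROOM, 8);

Omitting the two new attributes keeps today's behaviour, i.e. needed_{head,tail}room of 0.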
Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-1-daniel@iogearbox.net --- tools/include/uapi/linux/if_link.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 8516c1ccd57a..7e46ca4cd31b 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From 058268e23fcadc2bdb9297c6dff3a010c70f9762 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:58 +0100 Subject: selftests/bpf: Extend netkit tests to validate set {head,tail}room Extend the netkit selftests to specify and validate the {head,tail}room on the netdevice: # ./vmtest.sh -- ./test_progs -t netkit [...] ./test_progs -t netkit [ 1.174147] bpf_testmod: loading out-of-tree module taints kernel. [ 1.174585] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.422307] tsc: Refined TSC clocksource calibration: 3407.983 MHz [ 1.424511] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fc3e5084, max_idle_ns: 440795359833 ns [ 1.428092] clocksource: Switched to clocksource tsc #363 tc_netkit_basic:OK #364 tc_netkit_device:OK #365 tc_netkit_multi_links:OK #366 tc_netkit_multi_opts:OK #367 tc_netkit_neigh_links:OK #368 tc_netkit_pkt_type:OK #369 tc_netkit_scrub:OK Summary: 7/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-3-daniel@iogearbox.net --- tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 49 ++++++++++++++-------- tools/testing/selftests/bpf/progs/test_tc_link.c | 15 +++++++ 2 files changed, 46 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 151a4210028f..2461d183dee5 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -14,10 +14,16 @@ #include "netlink_helpers.h" #include "tc_helpers.h" +#define NETKIT_HEADROOM 32 +#define NETKIT_TAILROOM 8 + #define MARK 42 #define PRIO 0xeb9f #define ICMP_ECHO 8 +#define FLAG_ADJUST_ROOM (1 << 0) +#define FLAG_SAME_NETNS (1 << 1) + struct icmphdr { __u8 type; __u8 code; @@ -35,7 +41,7 @@ struct iplink_req { }; static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, - bool same_netns, int scrub, int peer_scrub) + int scrub, int peer_scrub, __u32 flags) { struct rtnl_handle rth = { .fd = -1 }; struct iplink_req req = {}; @@ -63,6 +69,10 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); + if (flags & FLAG_ADJUST_ROOM) { + addattr16(&req.n, sizeof(req), IFLA_NETKIT_HEADROOM, NETKIT_HEADROOM); + addattr16(&req.n, sizeof(req), IFLA_NETKIT_TAILROOM, NETKIT_TAILROOM); + } addattr_nest_end(&req.n, data); addattr_nest_end(&req.n, linkinfo); @@ -87,7 +97,7 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, " addr ee:ff:bb:cc:aa:dd"), 
"set hwaddress"); } - if (same_netns) { + if (flags & FLAG_SAME_NETNS) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); ASSERT_OK(system("ip addr add dev " netkit_peer " 10.0.0.2/24"), @@ -184,8 +194,8 @@ void serial_test_tc_netkit_basic(void) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -299,8 +309,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -428,8 +438,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -543,8 +553,8 @@ void serial_test_tc_netkit_device(void) int err, ifindex, ifindex2; err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -655,8 +665,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -733,8 +743,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode) struct bpf_link *link; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -799,7 +809,7 @@ void serial_test_tc_netkit_pkt_type(void) serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); } -static void serial_test_tc_netkit_scrub_type(int scrub) +static void serial_test_tc_netkit_scrub_type(int scrub, bool room) { LIBBPF_OPTS(bpf_netkit_opts, optl); struct test_tc_link *skel; @@ -807,7 +817,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, scrub, scrub); + &ifindex, scrub, scrub, + room ? FLAG_ADJUST_ROOM : 0); if (err) return; @@ -842,6 +853,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8"); ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark"); ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio"); + ASSERT_EQ(skel->bss->headroom, room ? NETKIT_HEADROOM : 0, "headroom"); + ASSERT_EQ(skel->bss->tailroom, room ? 
NETKIT_TAILROOM : 0, "tailroom"); cleanup: test_tc_link__destroy(skel); @@ -852,6 +865,6 @@ cleanup: void serial_test_tc_netkit_scrub(void) { - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT); - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT, false); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE, true); } diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 10d825928499..630f12e51b07 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -8,6 +8,7 @@ #include #include #include +#include char LICENSE[] SEC("license") = "GPL"; @@ -27,6 +28,7 @@ bool seen_host; bool seen_mcast; int mark, prio; +unsigned short headroom, tailroom; SEC("tc/ingress") int tc1(struct __sk_buff *skb) @@ -104,11 +106,24 @@ out: return TCX_PASS; } +struct sk_buff { + struct net_device *dev; +}; + +struct net_device { + unsigned short needed_headroom; + unsigned short needed_tailroom; +}; + SEC("tc/egress") int tc8(struct __sk_buff *skb) { + struct net_device *dev = BPF_CORE_READ((struct sk_buff *)skb, dev); + seen_tc8 = true; mark = skb->mark; prio = skb->priority; + headroom = BPF_CORE_READ(dev, needed_headroom); + tailroom = BPF_CORE_READ(dev, needed_tailroom); return TCX_PASS; } -- cgit v1.2.3 From 73b9075f334f5debf28646884a320b796b27768d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 24 Dec 2024 15:59:57 +0800 Subject: selftests/bpf: Avoid generating untracked files when running bpf selftests Currently, when we run the BPF selftests with the following command: make -C tools/testing/selftests TARGETS=bpf SKIP_TARGETS="" The command generates untracked files and directories with make version less than 4.4: ''' Untracked files: (use "git add ..." to include in what will be committed) tools/testing/selftests/bpfFEATURE-DUMP.selftests tools/testing/selftests/bpffeature/ ''' We lost slash after word "bpf". The reason is slash appending code is as follow: ''' OUTPUT := $(OUTPUT)/ $(eval include ../../../build/Makefile.feature) OUTPUT := $(patsubst %/,%,$(OUTPUT)) ''' This way of assigning values to OUTPUT will never be effective for the variable OUTPUT provided via the command argument [1] and BPF makefile is called from parent Makfile(tools/testing/selftests/Makefile) like: ''' all: ... $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET ''' According to GNU make, we can use override Directive to fix this issue [2]. 
[1] https://www.gnu.org/software/make/manual/make.html#Overriding [2] https://www.gnu.org/software/make/manual/make.html#Override-Directive Fixes: dc3a8804d790 ("selftests/bpf: Adapt OUTPUT appending logic to lower versions of Make") Signed-off-by: Jiayuan Chen Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241224075957.288018-1-mrpre@163.com --- tools/testing/selftests/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9e870e519c30..eb4d21651aa7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -202,9 +202,9 @@ ifeq ($(shell expr $(MAKE_VERSION) \>= 4.4), 1) $(let OUTPUT,$(OUTPUT)/,\ $(eval include ../../../build/Makefile.feature)) else -OUTPUT := $(OUTPUT)/ +override OUTPUT := $(OUTPUT)/ $(eval include ../../../build/Makefile.feature) -OUTPUT := $(patsubst %/,%,$(OUTPUT)) +override OUTPUT := $(patsubst %/,%,$(OUTPUT)) endif endif -- cgit v1.2.3 From 87091dd986db51406e64dd5e8c9d22617c66c6af Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sat, 4 Jan 2025 15:25:28 -0500 Subject: selftests/bpf: test bpf_for within spin lock section Add a selftest to ensure BPF for loops within critical sections are accepted by the verifier. Signed-off-by: Emil Tsalapatis (Meta) Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250104202528.882482-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_spin_lock.c | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c index 25599eac9a70..d9d7b05cf6d2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/verifier_spin_lock.c @@ -530,4 +530,30 @@ l1_%=: exit; \ : __clobber_all); } +SEC("tc") +__description("spin_lock: loop within a locked region") +__success __failure_unpriv __msg_unpriv("") +__retval(0) +int bpf_loop_inside_locked_region(void) +{ + const int zero = 0; + struct val *val; + int i, j = 0; + + val = bpf_map_lookup_elem(&map_spin_lock, &zero); + if (!val) + return -1; + + bpf_spin_lock(&val->l); + bpf_for(i, 0, 10) { + j++; + /* Silence "unused variable" warnings. */ + if (j == 10) + break; + } + bpf_spin_unlock(&val->l); + + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f44275e7155dc310d36516fc25be503da099781c Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 6 Jan 2025 20:17:31 +0000 Subject: selftests/bpf: add -fno-strict-aliasing to BPF_CFLAGS Following the discussion at [1], set -fno-strict-aliasing flag for all BPF object build rules. Remove now unnecessary -CFLAGS variables. [1] https://lore.kernel.org/bpf/20250106185447.951609-1-ihor.solodrai@pm.me/ CC: Jose E. 
Marchesi Signed-off-by: Ihor Solodrai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250106201728.1219791-1-ihor.solodrai@pm.me Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index eb4d21651aa7..d5be2f94deef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -54,21 +54,6 @@ PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) LDLIBS += $(PCAP_LIBS) CFLAGS += $(PCAP_CFLAGS) -# The following tests perform type punning and they may break strict -# aliasing rules, which are exploited by both GCC and clang by default -# while optimizing. This can lead to broken programs. -progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing -progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing -progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing -progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing -progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing -progs/syscall.c-CFLAGS := -fno-strict-aliasing -progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing -progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing -progs/timer_crash.c-CFLAGS := -fno-strict-aliasing -progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing -progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing - # Some utility functions use LLVM libraries jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) @@ -103,18 +88,6 @@ progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error -# The following tests do type-punning, via the __imm_insn macro, from -# `struct bpf_insn' to long and then uses the value. This triggers an -# "is used uninitialized" warning in GCC due to strict-aliasing -# rules. -progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing -progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing endif ifneq ($(CLANG_CPUV4),) @@ -474,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) \ + -fno-strict-aliasing \ -Wno-compare-distinct-pointer-types # TODO: enable me -Wsign-compare -- cgit v1.2.3 From 46c61cbeb82f8a4e6354a692d2be1a35cb0bde29 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 6 Jan 2025 14:43:21 +0000 Subject: selftests/bpf: Handle prog/attach type comparison in veristat Implemented handling of prog type and attach type stats comparison in veristat. 
To test this change: ``` ./veristat pyperf600.bpf.o -o csv > base1.csv ./veristat pyperf600.bpf.o -o csv > base2.csv ./veristat -C base2.csv base1.csv -o csv ...,raw_tracepoint,raw_tracepoint,MATCH, ...,cgroup_inet_ingress,cgroup_inet_ingress,MATCH ``` Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250106144321.32337-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/veristat.c | 37 ++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 476bf95cf684..974c808f9321 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -1688,9 +1688,42 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats st->stats[id] = val; break; } - case PROG_TYPE: - case ATTACH_TYPE: + case PROG_TYPE: { + enum bpf_prog_type prog_type = 0; + const char *type; + + while ((type = libbpf_bpf_prog_type_str(prog_type))) { + if (strcmp(type, str) == 0) { + st->stats[id] = prog_type; + break; + } + prog_type++; + } + + if (!type) { + fprintf(stderr, "Unrecognized prog type %s\n", str); + return -EINVAL; + } break; + } + case ATTACH_TYPE: { + enum bpf_attach_type attach_type = 0; + const char *type; + + while ((type = libbpf_bpf_attach_type_str(attach_type))) { + if (strcmp(type, str) == 0) { + st->stats[id] = attach_type; + break; + } + attach_type++; + } + + if (!type) { + fprintf(stderr, "Unrecognized attach type %s\n", str); + return -EINVAL; + } + break; + } default: fprintf(stderr, "Unrecognized stat #%d\n", id); return -EINVAL; -- cgit v1.2.3 From 912d6f6697251b0024e56ed24b7873b4800822e7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 3 Jan 2025 06:31:14 -0500 Subject: selftests/net: packetdrill: report benign debug flakes as xfail A few recently added packetdrill tests that are known time sensitive (e.g., because testing timestamping) occasionally fail in debug mode: https://netdev.bots.linux.dev/contest.html?executor=vmksft-packetdrill-dbg These failures are well understood. Correctness of the tests is verified in non-debug mode. Continue running in debug mode also, to keep coverage with debug instrumentation. But, only in debug mode, mark these tests with well understood timing issues as XFAIL (known failing) rather than FAIL when failing. Introduce an allow list xfail_list with known cases. Expand the ktap infrastructure with XFAIL support. 
Fixes: eab35989cc37 ("selftests/net: packetdrill: import tcp/fast_recovery, tcp/nagle, tcp/timestamping") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20241218100013.0c698629@kernel.org/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250103113142.129251-1-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/kselftest/ktap_helpers.sh | 15 ++++++++++++-- .../selftests/net/packetdrill/ksft_runner.sh | 23 +++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index 79a125eb24c2..05a461890671 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -7,6 +7,7 @@ KTAP_TESTNO=1 KTAP_CNT_PASS=0 KTAP_CNT_FAIL=0 +KTAP_CNT_XFAIL=0 KTAP_CNT_SKIP=0 KSFT_PASS=0 @@ -69,6 +70,16 @@ ktap_test_skip() { KTAP_CNT_SKIP=$((KTAP_CNT_SKIP+1)) } +ktap_test_xfail() { + description="$1" + + result="ok" + directive="XFAIL" + __ktap_test "$result" "$description" "$directive" + + KTAP_CNT_XFAIL=$((KTAP_CNT_XFAIL+1)) +} + ktap_test_fail() { description="$1" @@ -99,7 +110,7 @@ ktap_exit_fail_msg() { ktap_finished() { ktap_print_totals - if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then + if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP + KTAP_CNT_XFAIL)) -eq "$KSFT_NUM_TESTS" ]; then exit "$KSFT_PASS" else exit "$KSFT_FAIL" @@ -107,5 +118,5 @@ ktap_finished() { } ktap_print_totals() { - echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:0 xpass:0 skip:$KTAP_CNT_SKIP error:0" + echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:$KTAP_CNT_XFAIL xpass:0 skip:$KTAP_CNT_SKIP error:0" } diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index 4071c133f29e..ff989c325eef 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -23,7 +23,7 @@ if [ $# -ne 1 ]; then ktap_exit_fail_msg "usage: $0